Diffstat (limited to 'media/libjpeg/simd/arm')
-rw-r--r--  media/libjpeg/simd/arm/aarch32/jccolext-neon.c | 148
-rw-r--r--  media/libjpeg/simd/arm/aarch32/jchuff-neon.c | 334
-rw-r--r--  media/libjpeg/simd/arm/aarch32/jsimd.c | 978
-rw-r--r--  media/libjpeg/simd/arm/aarch32/jsimd_neon.S | 1200
-rw-r--r--  media/libjpeg/simd/arm/aarch64/jccolext-neon.c | 316
-rw-r--r--  media/libjpeg/simd/arm/aarch64/jchuff-neon.c | 411
-rw-r--r--  media/libjpeg/simd/arm/aarch64/jsimd.c | 1056
-rw-r--r--  media/libjpeg/simd/arm/aarch64/jsimd_neon.S | 2254
-rw-r--r--  media/libjpeg/simd/arm/align.h | 28
-rw-r--r--  media/libjpeg/simd/arm/jccolor-neon.c | 160
-rw-r--r--  media/libjpeg/simd/arm/jcgray-neon.c | 120
-rw-r--r--  media/libjpeg/simd/arm/jcgryext-neon.c | 106
-rw-r--r--  media/libjpeg/simd/arm/jchuff.h | 131
-rw-r--r--  media/libjpeg/simd/arm/jcphuff-neon.c | 622
-rw-r--r--  media/libjpeg/simd/arm/jcsample-neon.c | 192
-rw-r--r--  media/libjpeg/simd/arm/jdcolext-neon.c | 374
-rw-r--r--  media/libjpeg/simd/arm/jdcolor-neon.c | 142
-rw-r--r--  media/libjpeg/simd/arm/jdmerge-neon.c | 145
-rw-r--r--  media/libjpeg/simd/arm/jdmrgext-neon.c | 723
-rw-r--r--  media/libjpeg/simd/arm/jdsample-neon.c | 569
-rw-r--r--  media/libjpeg/simd/arm/jfdctfst-neon.c | 214
-rw-r--r--  media/libjpeg/simd/arm/jfdctint-neon.c | 376
-rw-r--r--  media/libjpeg/simd/arm/jidctfst-neon.c | 472
-rw-r--r--  media/libjpeg/simd/arm/jidctint-neon.c | 802
-rw-r--r--  media/libjpeg/simd/arm/jidctred-neon.c | 486
-rw-r--r--  media/libjpeg/simd/arm/jquanti-neon.c | 193
-rw-r--r--  media/libjpeg/simd/arm/neon-compat.h | 33
27 files changed, 12585 insertions, 0 deletions
diff --git a/media/libjpeg/simd/arm/aarch32/jccolext-neon.c b/media/libjpeg/simd/arm/aarch32/jccolext-neon.c
new file mode 100644
index 0000000000..362102d2b2
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jccolext-neon.c
@@ -0,0 +1,148 @@
+/*
+ * jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * 0.16874695 = 11059 * 2^-16
+ * 0.33125305 = 21709 * 2^-16
+ * 0.50000000 = 32768 * 2^-16
+ * 0.41868592 = 27439 * 2^-16
+ * 0.08131409 = 5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ /* Pointer to RGB(X/A) input data */
+ JSAMPROW inptr;
+ /* Pointers to Y, Cb, and Cr output data */
+ JSAMPROW outptr0, outptr1, outptr2;
+ /* Allocate temporary buffer for final (image_width % 8) pixels in row. */
+ ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
+
+ /* Set up conversion constants. */
+#ifdef HAVE_VLD1_U16_X2
+ const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
+#else
+ /* GCC does not currently support the intrinsic vld1_<type>_x2(). */
+ const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts);
+ const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
+ const uint16x4x2_t consts = { { consts1, consts2 } };
+#endif
+ const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ int cols_remaining = image_width;
+ for (; cols_remaining > 0; cols_remaining -= 8) {
+
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 8) columns of data are first memcopied to a temporary
+ * buffer large enough to accommodate the vector load.
+ */
+ if (cols_remaining < 8) {
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+ }
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+ uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+ uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+ uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+ uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0);
+ y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1);
+ y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2);
+ uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0);
+ y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1);
+ y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_low = scaled_128_5;
+ cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3);
+ cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0);
+ cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1);
+ uint32x4_t cb_high = scaled_128_5;
+ cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3);
+ cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0);
+ cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_low = scaled_128_5;
+ cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1);
+ cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2);
+ cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3);
+ uint32x4_t cr_high = scaled_128_5;
+ cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1);
+ cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2);
+ cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16),
+ vrshrn_n_u32(y_high, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16),
+ vshrn_n_u32(cb_high, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16),
+ vshrn_n_u32(cr_high, 16));
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1_u8(outptr0, vmovn_u16(y_u16));
+ vst1_u8(outptr1, vmovn_u16(cb_u16));
+ vst1_u8(outptr2, vmovn_u16(cr_u16));
+
+ /* Increment pointers. */
+ inptr += (8 * RGB_PIXELSIZE);
+ outptr0 += 8;
+ outptr1 += 8;
+ outptr2 += 8;
+ }
+ }
+}
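
For reference, the fixed-point arithmetic that jsimd_rgb_ycc_convert_neon() vectorizes can be reproduced with a short scalar sketch. The helper and test values below are illustrative only (they are not libjpeg-turbo code); the constants and rounding mirror the comments above: Y is descaled with a rounding right shift, while Cb and Cr fold both the +128 offset and the +0.5 rounding term into the single constant (128 << 16) + 32767 and then truncate.

    /* Scalar sketch (not libjpeg-turbo code) of the fixed-point RGB -> YCbCr
     * math that jsimd_rgb_ycc_convert_neon() performs eight pixels at a time.
     */
    #include <stdint.h>
    #include <stdio.h>

    static void rgb_to_ycc_scalar(uint8_t r, uint8_t g, uint8_t b,
                                  uint8_t *y, uint8_t *cb, uint8_t *cr)
    {
      uint32_t y32  = 19595 * r + 38470 * g + 7471 * b;
      uint32_t cb32 = (128 << 16) + 32767 - 11059 * r - 21709 * g + 32768 * b;
      uint32_t cr32 = (128 << 16) + 32767 + 32768 * r - 27439 * g - 5329 * b;

      *y  = (uint8_t)((y32 + 32768) >> 16);  /* rounding right shift by 16 */
      *cb = (uint8_t)(cb32 >> 16);           /* truncating shift; +0.5 folded in */
      *cr = (uint8_t)(cr32 >> 16);
    }

    int main(void)
    {
      uint8_t y, cb, cr;
      rgb_to_ycc_scalar(255, 0, 0, &y, &cb, &cr);   /* pure red */
      printf("Y=%d Cb=%d Cr=%d\n", y, cb, cr);      /* expect 76, 85, 255 */
      return 0;
    }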
diff --git a/media/libjpeg/simd/arm/aarch32/jchuff-neon.c b/media/libjpeg/simd/arm/aarch32/jchuff-neon.c
new file mode 100644
index 0000000000..19d94f720d
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jchuff-neon.c
@@ -0,0 +1,334 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+ JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ uint8_t block_nbits[DCTSIZE2];
+ uint16_t block_diff[DCTSIZE2];
+
+ /* Load rows of coefficients from DCT block in zig-zag order. */
+
+ /* Compute DC coefficient difference value. (F.1.1.5.1) */
+ int16x8_t row0 = vdupq_n_s16(block[0] - last_dc_val);
+ row0 = vld1q_lane_s16(block + 1, row0, 1);
+ row0 = vld1q_lane_s16(block + 8, row0, 2);
+ row0 = vld1q_lane_s16(block + 16, row0, 3);
+ row0 = vld1q_lane_s16(block + 9, row0, 4);
+ row0 = vld1q_lane_s16(block + 2, row0, 5);
+ row0 = vld1q_lane_s16(block + 3, row0, 6);
+ row0 = vld1q_lane_s16(block + 10, row0, 7);
+
+ int16x8_t row1 = vld1q_dup_s16(block + 17);
+ row1 = vld1q_lane_s16(block + 24, row1, 1);
+ row1 = vld1q_lane_s16(block + 32, row1, 2);
+ row1 = vld1q_lane_s16(block + 25, row1, 3);
+ row1 = vld1q_lane_s16(block + 18, row1, 4);
+ row1 = vld1q_lane_s16(block + 11, row1, 5);
+ row1 = vld1q_lane_s16(block + 4, row1, 6);
+ row1 = vld1q_lane_s16(block + 5, row1, 7);
+
+ int16x8_t row2 = vld1q_dup_s16(block + 12);
+ row2 = vld1q_lane_s16(block + 19, row2, 1);
+ row2 = vld1q_lane_s16(block + 26, row2, 2);
+ row2 = vld1q_lane_s16(block + 33, row2, 3);
+ row2 = vld1q_lane_s16(block + 40, row2, 4);
+ row2 = vld1q_lane_s16(block + 48, row2, 5);
+ row2 = vld1q_lane_s16(block + 41, row2, 6);
+ row2 = vld1q_lane_s16(block + 34, row2, 7);
+
+ int16x8_t row3 = vld1q_dup_s16(block + 27);
+ row3 = vld1q_lane_s16(block + 20, row3, 1);
+ row3 = vld1q_lane_s16(block + 13, row3, 2);
+ row3 = vld1q_lane_s16(block + 6, row3, 3);
+ row3 = vld1q_lane_s16(block + 7, row3, 4);
+ row3 = vld1q_lane_s16(block + 14, row3, 5);
+ row3 = vld1q_lane_s16(block + 21, row3, 6);
+ row3 = vld1q_lane_s16(block + 28, row3, 7);
+
+ int16x8_t abs_row0 = vabsq_s16(row0);
+ int16x8_t abs_row1 = vabsq_s16(row1);
+ int16x8_t abs_row2 = vabsq_s16(row2);
+ int16x8_t abs_row3 = vabsq_s16(row3);
+
+ int16x8_t row0_lz = vclzq_s16(abs_row0);
+ int16x8_t row1_lz = vclzq_s16(abs_row1);
+ int16x8_t row2_lz = vclzq_s16(abs_row2);
+ int16x8_t row3_lz = vclzq_s16(abs_row3);
+
+ /* Compute number of bits required to represent each coefficient. */
+ uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
+ uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
+ uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
+ uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
+
+ vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
+ vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
+ vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
+ vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
+
+ uint16x8_t row0_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row0, 15)),
+ vnegq_s16(row0_lz));
+ uint16x8_t row1_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row1, 15)),
+ vnegq_s16(row1_lz));
+ uint16x8_t row2_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row2, 15)),
+ vnegq_s16(row2_lz));
+ uint16x8_t row3_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row3, 15)),
+ vnegq_s16(row3_lz));
+
+ uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+ uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), row1_mask);
+ uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), row2_mask);
+ uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), row3_mask);
+
+ /* Store diff values for rows 0, 1, 2, and 3. */
+ vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+ vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+ vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+ vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+
+ /* Load last four rows of coefficients from DCT block in zig-zag order. */
+ int16x8_t row4 = vld1q_dup_s16(block + 35);
+ row4 = vld1q_lane_s16(block + 42, row4, 1);
+ row4 = vld1q_lane_s16(block + 49, row4, 2);
+ row4 = vld1q_lane_s16(block + 56, row4, 3);
+ row4 = vld1q_lane_s16(block + 57, row4, 4);
+ row4 = vld1q_lane_s16(block + 50, row4, 5);
+ row4 = vld1q_lane_s16(block + 43, row4, 6);
+ row4 = vld1q_lane_s16(block + 36, row4, 7);
+
+ int16x8_t row5 = vld1q_dup_s16(block + 29);
+ row5 = vld1q_lane_s16(block + 22, row5, 1);
+ row5 = vld1q_lane_s16(block + 15, row5, 2);
+ row5 = vld1q_lane_s16(block + 23, row5, 3);
+ row5 = vld1q_lane_s16(block + 30, row5, 4);
+ row5 = vld1q_lane_s16(block + 37, row5, 5);
+ row5 = vld1q_lane_s16(block + 44, row5, 6);
+ row5 = vld1q_lane_s16(block + 51, row5, 7);
+
+ int16x8_t row6 = vld1q_dup_s16(block + 58);
+ row6 = vld1q_lane_s16(block + 59, row6, 1);
+ row6 = vld1q_lane_s16(block + 52, row6, 2);
+ row6 = vld1q_lane_s16(block + 45, row6, 3);
+ row6 = vld1q_lane_s16(block + 38, row6, 4);
+ row6 = vld1q_lane_s16(block + 31, row6, 5);
+ row6 = vld1q_lane_s16(block + 39, row6, 6);
+ row6 = vld1q_lane_s16(block + 46, row6, 7);
+
+ int16x8_t row7 = vld1q_dup_s16(block + 53);
+ row7 = vld1q_lane_s16(block + 60, row7, 1);
+ row7 = vld1q_lane_s16(block + 61, row7, 2);
+ row7 = vld1q_lane_s16(block + 54, row7, 3);
+ row7 = vld1q_lane_s16(block + 47, row7, 4);
+ row7 = vld1q_lane_s16(block + 55, row7, 5);
+ row7 = vld1q_lane_s16(block + 62, row7, 6);
+ row7 = vld1q_lane_s16(block + 63, row7, 7);
+
+ int16x8_t abs_row4 = vabsq_s16(row4);
+ int16x8_t abs_row5 = vabsq_s16(row5);
+ int16x8_t abs_row6 = vabsq_s16(row6);
+ int16x8_t abs_row7 = vabsq_s16(row7);
+
+ int16x8_t row4_lz = vclzq_s16(abs_row4);
+ int16x8_t row5_lz = vclzq_s16(abs_row5);
+ int16x8_t row6_lz = vclzq_s16(abs_row6);
+ int16x8_t row7_lz = vclzq_s16(abs_row7);
+
+ /* Compute number of bits required to represent each coefficient. */
+ uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
+ uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
+ uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
+ uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
+
+ vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
+ vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
+ vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
+ vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
+
+ uint16x8_t row4_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row4, 15)),
+ vnegq_s16(row4_lz));
+ uint16x8_t row5_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row5, 15)),
+ vnegq_s16(row5_lz));
+ uint16x8_t row6_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row6, 15)),
+ vnegq_s16(row6_lz));
+ uint16x8_t row7_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row7, 15)),
+ vnegq_s16(row7_lz));
+
+ uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), row4_mask);
+ uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), row5_mask);
+ uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), row6_mask);
+ uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), row7_mask);
+
+ /* Store diff values for rows 4, 5, 6, and 7. */
+ vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+ vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+ vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+ vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+ /* Construct bitmap to accelerate encoding of AC coefficients. A set bit
+ * means that the corresponding coefficient != 0.
+ */
+ uint8x8_t row0_nbits_gt0 = vcgt_u8(row0_nbits, vdup_n_u8(0));
+ uint8x8_t row1_nbits_gt0 = vcgt_u8(row1_nbits, vdup_n_u8(0));
+ uint8x8_t row2_nbits_gt0 = vcgt_u8(row2_nbits, vdup_n_u8(0));
+ uint8x8_t row3_nbits_gt0 = vcgt_u8(row3_nbits, vdup_n_u8(0));
+ uint8x8_t row4_nbits_gt0 = vcgt_u8(row4_nbits, vdup_n_u8(0));
+ uint8x8_t row5_nbits_gt0 = vcgt_u8(row5_nbits, vdup_n_u8(0));
+ uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
+ uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
+
+ /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+ const uint8x8_t bitmap_mask =
+ vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
+
+ row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
+ row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
+ row2_nbits_gt0 = vand_u8(row2_nbits_gt0, bitmap_mask);
+ row3_nbits_gt0 = vand_u8(row3_nbits_gt0, bitmap_mask);
+ row4_nbits_gt0 = vand_u8(row4_nbits_gt0, bitmap_mask);
+ row5_nbits_gt0 = vand_u8(row5_nbits_gt0, bitmap_mask);
+ row6_nbits_gt0 = vand_u8(row6_nbits_gt0, bitmap_mask);
+ row7_nbits_gt0 = vand_u8(row7_nbits_gt0, bitmap_mask);
+
+ uint8x8_t bitmap_rows_10 = vpadd_u8(row1_nbits_gt0, row0_nbits_gt0);
+ uint8x8_t bitmap_rows_32 = vpadd_u8(row3_nbits_gt0, row2_nbits_gt0);
+ uint8x8_t bitmap_rows_54 = vpadd_u8(row5_nbits_gt0, row4_nbits_gt0);
+ uint8x8_t bitmap_rows_76 = vpadd_u8(row7_nbits_gt0, row6_nbits_gt0);
+ uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
+ uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
+ uint8x8_t bitmap = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
+
+ /* Shift left to remove DC bit. */
+ bitmap = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap), 1));
+ /* Move bitmap to 32-bit scalar registers. */
+ uint32_t bitmap_1_32 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 1);
+ uint32_t bitmap_33_63 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 0);
+
+ /* Set up state and bit buffer for output bitstream. */
+ working_state *state_ptr = (working_state *)state;
+ int free_bits = state_ptr->cur.free_bits;
+ size_t put_buffer = state_ptr->cur.put_buffer;
+
+ /* Encode DC coefficient. */
+
+ unsigned int nbits = block_nbits[0];
+ /* Emit Huffman-coded symbol and additional diff bits. */
+ unsigned int diff = block_diff[0];
+ PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+ /* Encode AC coefficients. */
+
+ unsigned int r = 0; /* r = run length of zeros */
+ unsigned int i = 1; /* i = number of coefficients encoded */
+ /* Code and size information for a run length of 16 zero coefficients */
+ const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+ const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+ while (bitmap_1_32 != 0) {
+ r = BUILTIN_CLZ(bitmap_1_32);
+ i += r;
+ bitmap_1_32 <<= r;
+ nbits = block_nbits[i];
+ diff = block_diff[i];
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ i++;
+ bitmap_1_32 <<= 1;
+ }
+
+ r = 33 - i;
+ i = 33;
+
+ while (bitmap_33_63 != 0) {
+ unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
+ r += leading_zeros;
+ i += leading_zeros;
+ bitmap_33_63 <<= leading_zeros;
+ nbits = block_nbits[i];
+ diff = block_diff[i];
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ r = 0;
+ i++;
+ bitmap_33_63 <<= 1;
+ }
+
+ /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+ * The value of RS for the EOB code is 0.
+ */
+ if (i != 64) {
+ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+ }
+
+ state_ptr->cur.put_buffer = put_buffer;
+ state_ptr->cur.free_bits = free_bits;
+
+ return buffer;
+}
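
For reference, the two per-coefficient values that jsimd_huff_encode_one_block_neon() precomputes, the magnitude category ("nbits") and the additional bits ("diff"), fall out of the same abs/clz/mask trick used in the vector code above. The scalar sketch below is illustrative only and not libjpeg-turbo code; it uses __builtin_clz, the GCC/Clang builtin that BUILTIN_CLZ typically expands to.

    /* Scalar sketch (not libjpeg-turbo code) of the nbits/diff derivation:
     * nbits is the number of bits needed for |coef|, and diff equals coef for
     * coef >= 0 or the low nbits of (coef - 1) for coef < 0, obtained by
     * XORing |coef| with an all-ones mask of width nbits.
     */
    #include <stdint.h>
    #include <stdio.h>

    static void coef_to_nbits_diff(int16_t coef, unsigned *nbits, uint16_t *diff)
    {
      uint16_t absval = (uint16_t)(coef < 0 ? -coef : coef);
      unsigned lz = absval ? __builtin_clz(absval) - 16 : 16;  /* 16-bit clz */
      uint16_t mask = (coef < 0) ? (uint16_t)(0xFFFFu >> lz) : 0;

      *nbits = 16 - lz;
      *diff = absval ^ mask;
    }

    int main(void)
    {
      unsigned nbits;
      uint16_t diff;
      coef_to_nbits_diff(-5, &nbits, &diff);
      printf("nbits=%u diff=0x%x\n", nbits, (unsigned)diff);  /* nbits=3 diff=0x2 */
      return 0;
    }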
diff --git a/media/libjpeg/simd/arm/aarch32/jsimd.c b/media/libjpeg/simd/arm/aarch32/jsimd.c
new file mode 100644
index 0000000000..920f7656eb
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jsimd.c
@@ -0,0 +1,978 @@
+/*
+ * jsimd_arm.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2019, Google LLC.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit Arm architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+ char *p;
+
+ if (*feature == 0)
+ return 0;
+ if (strncmp(buffer, "Features", 8) != 0)
+ return 0;
+ buffer += 8;
+ while (isspace(*buffer))
+ buffer++;
+
+ /* Check if 'feature' is present in the buffer as a separate word */
+ while ((p = strstr(buffer, feature))) {
+ if (p > buffer && !isspace(*(p - 1))) {
+ buffer++;
+ continue;
+ }
+ p += strlen(feature);
+ if (*p != 0 && !isspace(*p)) {
+ buffer++;
+ continue;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+ char *buffer = (char *)malloc(bufsize);
+ FILE *fd;
+
+ simd_support = 0;
+
+ if (!buffer)
+ return 0;
+
+ fd = fopen("/proc/cpuinfo", "r");
+ if (fd) {
+ while (fgets(buffer, bufsize, fd)) {
+ if (!strchr(buffer, '\n') && !feof(fd)) {
+ /* "impossible" happened - insufficient size of the buffer! */
+ fclose(fd);
+ free(buffer);
+ return 0;
+ }
+ if (check_feature(buffer, "neon"))
+ simd_support |= JSIMD_NEON;
+ }
+ fclose(fd);
+ }
+ free(buffer);
+ return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char env[2] = { 0 };
+#endif
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+ int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = 0;
+
+#if defined(__ARM_NEON__)
+ simd_support |= JSIMD_NEON;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+ /* We still have a chance to use Neon regardless of globally used
+ * -mcpu/-mfpu options passed to gcc by performing runtime detection via
+ * /proc/cpuinfo parsing on linux/android */
+ while (!parse_proc_cpuinfo(bufsize)) {
+ bufsize *= 2;
+ if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+ break;
+ }
+#endif
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
+ simd_support = JSIMD_NEON;
+ if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+ simd_support = 0;
+ if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+ simd_huffman = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_extrgb_ycc_convert_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_extrgbx_ycc_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_extbgr_ycc_convert_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_extbgrx_ycc_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_extxbgr_ycc_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_extxrgb_ycc_convert_neon;
+ break;
+ default:
+ neonfct = jsimd_extrgb_ycc_convert_neon;
+ break;
+ }
+
+ neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_extrgb_gray_convert_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_extrgbx_gray_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_extbgr_gray_convert_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_extbgrx_gray_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_extxbgr_gray_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_extxrgb_gray_convert_neon;
+ break;
+ default:
+ neonfct = jsimd_extrgb_gray_convert_neon;
+ break;
+ }
+
+ neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_ycc_extrgb_convert_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_ycc_extrgbx_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_ycc_extbgr_convert_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_ycc_extbgrx_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_ycc_extxbgr_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_ycc_extxrgb_convert_neon;
+ break;
+ default:
+ neonfct = jsimd_ycc_extrgb_convert_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+ output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+ break;
+ default:
+ neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+ break;
+ default:
+ neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(IFAST_MULT_TYPE) != 2)
+ return 0;
+ if (IFAST_SCALE_BITS != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON && simd_huffman)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+ dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+ jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+ jpeg_natural_order_start, Sl,
+ Al, absvalues, bits);
+}
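
The runtime detection path in the file above (check_feature() and parse_proc_cpuinfo()) is only compiled when __ARM_NEON__ is not defined; it scans each "Features" line of /proc/cpuinfo for "neon" as a whole word. The standalone sketch below reproduces that whole-word match against a hard-coded sample line; the sample line and the helper name are illustrative, not library code.

    /* Illustrative standalone version (not libjpeg-turbo code) of the
     * whole-word feature match performed by check_feature() above.
     */
    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    static int has_feature(const char *line, const char *feature)
    {
      const char *p = line;
      size_t len = strlen(feature);

      if (strncmp(line, "Features", 8) != 0)
        return 0;
      while ((p = strstr(p, feature)) != NULL) {
        int starts_word = (p == line) || isspace((unsigned char)p[-1]);
        int ends_word = (p[len] == '\0') || isspace((unsigned char)p[len]);
        if (starts_word && ends_word)
          return 1;
        p++;
      }
      return 0;
    }

    int main(void)
    {
      const char *sample = "Features\t: half thumb fastmult vfp edsp neon vfpv3";
      printf("neon: %d\n", has_feature(sample, "neon"));    /* 1 */
      printf("neonx: %d\n", has_feature(sample, "neonx"));  /* 0 */
      return 0;
    }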
diff --git a/media/libjpeg/simd/arm/aarch32/jsimd_neon.S b/media/libjpeg/simd/arm/aarch32/jsimd_neon.S
new file mode 100644
index 0000000000..7e1e2b1451
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jsimd_neon.S
@@ -0,0 +1,1200 @@
+/*
+ * Armv7 Neon optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ * All Rights Reserved.
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved.
+ * Copyright (C) 2014, Linaro Limited. All Rights Reserved.
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
+#endif
+
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.arm
+.syntax unified
+
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro asm_function fname
+#ifdef __APPLE__
+ .private_extern _\fname
+ .globl _\fname
+_\fname:
+#else
+ .global \fname
+#ifdef __ELF__
+ .hidden \fname
+ .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+
+#define CENTERJSAMPLE 128
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ * JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+
+#define FIX_0_298631336 (2446)
+#define FIX_0_390180644 (3196)
+#define FIX_0_541196100 (4433)
+#define FIX_0_765366865 (6270)
+#define FIX_0_899976223 (7373)
+#define FIX_1_175875602 (9633)
+#define FIX_1_501321110 (12299)
+#define FIX_1_847759065 (15137)
+#define FIX_1_961570560 (16069)
+#define FIX_2_053119869 (16819)
+#define FIX_2_562915447 (20995)
+#define FIX_3_072711026 (25172)
+
+#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
+#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
+#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
+#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
+#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
+#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
+#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
+#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
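
The FIX_* values above are libjpeg's slow integer IDCT multipliers expressed in Q13 fixed point, i.e. FIX(c) = round(c * 2^13) with CONST_BITS = 13. A quick illustrative check (not library code):

    /* Illustrative check that the FIX_* constants above are the IDCT
     * multipliers scaled by 2^13 and rounded to the nearest integer.
     */
    #include <stdio.h>

    #define CONST_BITS 13
    #define FIX(x) ((long)((x) * (1L << CONST_BITS) + 0.5))

    int main(void)
    {
      /* Expect 2446, 4433 and 9633, matching FIX_0_298631336,
       * FIX_0_541196100 and FIX_1_175875602 above. */
      printf("%ld %ld %ld\n", FIX(0.298631336), FIX(0.541196100),
             FIX(1.175875602));
      return 0;
    }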
+
+/*
+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
+ */
+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
+ DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
+ JLONG q1, q2, q3, q4, q5, q6, q7; \
+ JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
+ \
+ /* 1-D iDCT input data */ \
+ row0 = xrow0; \
+ row1 = xrow1; \
+ row2 = xrow2; \
+ row3 = xrow3; \
+ row4 = xrow4; \
+ row5 = xrow5; \
+ row6 = xrow6; \
+ row7 = xrow7; \
+ \
+ q5 = row7 + row3; \
+ q4 = row5 + row1; \
+ q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
+ MULTIPLY(q4, FIX_1_175875602); \
+ q7 = MULTIPLY(q5, FIX_1_175875602) + \
+ MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
+ q2 = MULTIPLY(row2, FIX_0_541196100) + \
+ MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
+ q4 = q6; \
+ q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
+ q6 += MULTIPLY(row5, -FIX_2_562915447) + \
+ MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
+ /* now we can use q1 (reloadable constants have been used up) */ \
+ q1 = q3 + q2; \
+ q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
+ MULTIPLY(row1, -FIX_0_899976223); \
+ q5 = q7; \
+ q1 = q1 + q6; \
+ q7 += MULTIPLY(row7, -FIX_0_899976223) + \
+ MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
+ \
+ /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
+ tmp11_plus_tmp2 = q1; \
+ row1 = 0; \
+ \
+ q1 = q1 - q6; \
+ q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
+ MULTIPLY(row3, -FIX_2_562915447); \
+ q1 = q1 - q6; \
+ q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
+ MULTIPLY(row6, FIX_0_541196100); \
+ q3 = q3 - q2; \
+ \
+ /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
+ tmp11_minus_tmp2 = q1; \
+ \
+ q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
+ q2 = q1 + q6; \
+ q1 = q1 - q6; \
+ \
+ /* pick up the results */ \
+ tmp0 = q4; \
+ tmp1 = q5; \
+ tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
+ tmp3 = q7; \
+ tmp10 = q2; \
+ tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
+ tmp12 = q3; \
+ tmp13 = q1; \
+}
+
+#define XFIX_0_899976223 d0[0]
+#define XFIX_0_541196100 d0[1]
+#define XFIX_2_562915447 d0[2]
+#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
+#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
+#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
+#define XFIX_0_541196100_PLUS_0_765366865 d1[2]
+#define XFIX_1_175875602 d1[3]
+#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
+#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
+#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
+#define XFIX_1_175875602_MINUS_1_961570560 d2[3]
+
+.balign 16
+jsimd_idct_islow_neon_consts:
+ .short FIX_0_899976223 /* d0[0] */
+ .short FIX_0_541196100 /* d0[1] */
+ .short FIX_2_562915447 /* d0[2] */
+ .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
+ .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
+ .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
+ .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
+ .short FIX_1_175875602 /* d1[3] */
+ /* reloadable constants */
+ .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
+ .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
+ .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
+ .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
+
+asm_function jsimd_idct_islow_neon
+
+ DCT_TABLE .req r0
+ COEF_BLOCK .req r1
+ OUTPUT_BUF .req r2
+ OUTPUT_COL .req r3
+ TMP1 .req r0
+ TMP2 .req r1
+ TMP3 .req r2
+ TMP4 .req ip
+
+ ROW0L .req d16
+ ROW0R .req d17
+ ROW1L .req d18
+ ROW1R .req d19
+ ROW2L .req d20
+ ROW2R .req d21
+ ROW3L .req d22
+ ROW3R .req d23
+ ROW4L .req d24
+ ROW4R .req d25
+ ROW5L .req d26
+ ROW5R .req d27
+ ROW6L .req d28
+ ROW6R .req d29
+ ROW7L .req d30
+ ROW7R .req d31
+
+ /* Load and dequantize coefficients into Neon registers
+ * with the following allocation:
+ * 0 1 2 3 | 4 5 6 7
+ * ---------+--------
+ * 0 | d16 | d17 ( q8 )
+ * 1 | d18 | d19 ( q9 )
+ * 2 | d20 | d21 ( q10 )
+ * 3 | d22 | d23 ( q11 )
+ * 4 | d24 | d25 ( q12 )
+ * 5 | d26 | d27 ( q13 )
+ * 6 | d28 | d29 ( q14 )
+ * 7 | d30 | d31 ( q15 )
+ */
+ adr ip, jsimd_idct_islow_neon_consts
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+ vmul.s16 q8, q8, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q9, q9, q1
+ vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+ vmul.s16 q10, q10, q2
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vmul.s16 q11, q11, q3
+ vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+ vmul.s16 q12, q12, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q14, q14, q2
+ vmul.s16 q13, q13, q1
+ vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
+ add ip, ip, #16
+ vmul.s16 q15, q15, q3
+ vpush {d8 - d15} /* save Neon registers */
+ /* 1-D IDCT, pass 1, left 4x8 half */
+ vadd.s16 d4, ROW7L, ROW3L
+ vadd.s16 d5, ROW5L, ROW1L
+ vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
+ vmlal.s16 q6, d5, XFIX_1_175875602
+ vmull.s16 q7, d4, XFIX_1_175875602
+ /* Check for the zero coefficients in the right 4x8 half */
+ push {r4, r5}
+ vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
+ vsubl.s16 q3, ROW0L, ROW4L
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
+ vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+ orr r0, r4, r5
+ vmov q4, q6
+ vmlsl.s16 q6, ROW5L, XFIX_2_562915447
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+ vshl.s32 q3, q3, #13
+ orr r0, r0, r4
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ orr r0, r0, r5
+ vadd.s32 q1, q3, q2
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ orr r0, r0, r4
+ vmlsl.s16 q7, ROW7L, XFIX_0_899976223
+ orr r0, r0, r5
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+ vrshrn.s32 ROW1L, q1, #11
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+ orr r0, r0, r4
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ orr r0, r0, r5
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+ vmlal.s16 q6, ROW6L, XFIX_0_541196100
+ vsub.s32 q3, q3, q2
+ orr r0, r0, r4
+ vrshrn.s32 ROW6L, q1, #11
+ orr r0, r0, r5
+ vadd.s32 q1, q3, q5
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW0L, ROW4L
+ orr r0, r0, r4
+ vrshrn.s32 ROW2L, q1, #11
+ orr r0, r0, r5
+ vrshrn.s32 ROW5L, q3, #11
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+ orr r0, r0, r4
+ vadd.s32 q2, q5, q6
+ orrs r0, r0, r5
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ orr r0, r4, r5
+ vsub.s32 q3, q1, q4
+ pop {r4, r5}
+ vrshrn.s32 ROW7L, q2, #11
+ vrshrn.s32 ROW3L, q5, #11
+ vrshrn.s32 ROW0L, q6, #11
+ vrshrn.s32 ROW4L, q3, #11
+
+ beq 3f /* Go to do some special handling for the sparse
+ right 4x8 half */
+
+ /* 1-D IDCT, pass 1, right 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vadd.s16 d10, ROW7R, ROW3R
+ vadd.s16 d8, ROW5R, ROW1R
+ /* Transpose left 4x8 half */
+ vtrn.16 ROW6L, ROW7L
+ vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
+ vmlal.s16 q6, d8, XFIX_1_175875602
+ vtrn.16 ROW2L, ROW3L
+ vmull.s16 q7, d10, XFIX_1_175875602
+ vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
+ vtrn.16 ROW0L, ROW1L
+ vsubl.s16 q3, ROW0R, ROW4R
+ vmull.s16 q2, ROW2R, XFIX_0_541196100
+ vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+ vtrn.16 ROW4L, ROW5L
+ vmov q4, q6
+ vmlsl.s16 q6, ROW5R, XFIX_2_562915447
+ vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
+ vtrn.32 ROW1L, ROW3L
+ vshl.s32 q3, q3, #13
+ vmlsl.s16 q4, ROW1R, XFIX_0_899976223
+ vtrn.32 ROW4L, ROW6L
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ vtrn.32 ROW0L, ROW2L
+ vmlsl.s16 q7, ROW7R, XFIX_0_899976223
+ vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
+ vrshrn.s32 ROW1R, q1, #11
+ vtrn.32 ROW5L, ROW7L
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+ vmlsl.s16 q5, ROW3R, XFIX_2_562915447
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+ vmlal.s16 q6, ROW6R, XFIX_0_541196100
+ vsub.s32 q3, q3, q2
+ vrshrn.s32 ROW6R, q1, #11
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW0R, ROW4R
+ vrshrn.s32 ROW2R, q1, #11
+ vrshrn.s32 ROW5R, q3, #11
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vrshrn.s32 ROW7R, q2, #11
+ vrshrn.s32 ROW3R, q5, #11
+ vrshrn.s32 ROW0R, q6, #11
+ vrshrn.s32 ROW4R, q3, #11
+ /* Transpose right 4x8 half */
+ vtrn.16 ROW6R, ROW7R
+ vtrn.16 ROW2R, ROW3R
+ vtrn.16 ROW0R, ROW1R
+ vtrn.16 ROW4R, ROW5R
+ vtrn.32 ROW1R, ROW3R
+ vtrn.32 ROW4R, ROW6R
+ vtrn.32 ROW0R, ROW2R
+ vtrn.32 ROW5R, ROW7R
+
+1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW1L, XFIX_1_175875602
+ vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW3L, XFIX_1_175875602
+ vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+ vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
+ vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
+ vmov q4, q6
+ vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+ vshl.s32 q3, q3, #13
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+ vshrn.s32 ROW1L, q1, #16
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW2L, q1, #16
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW3L, q5, #16
+ vshrn.s32 ROW0L, q6, #16
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
+ /* 1-D IDCT, pass 2, right 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW5R, XFIX_1_175875602
+ vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
+ vmull.s16 q7, ROW7R, XFIX_1_175875602
+ vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
+ vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
+ vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
+ vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+ vmov q4, q6
+ vmlsl.s16 q6, ROW5R, XFIX_2_562915447
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
+ vshl.s32 q3, q3, #13
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ vmlsl.s16 q7, ROW7R, XFIX_0_899976223
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
+ vmlal.s16 q6, ROW6R, XFIX_0_541196100
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW6R, q1, #16
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
+ vshrn.s32 ROW5R, q3, #16
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW7R, q2, #16
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW4R, q3, #16
+
+2: /* Descale to 8-bit and range limit */
+ vqrshrn.s16 d16, q8, #2
+ vqrshrn.s16 d17, q9, #2
+ vqrshrn.s16 d18, q10, #2
+ vqrshrn.s16 d19, q11, #2
+ vpop {d8 - d15} /* restore Neon registers */
+ vqrshrn.s16 d20, q12, #2
+ /* Transpose the final 8-bit samples and do signed->unsigned conversion */
+ vtrn.16 q8, q9
+ vqrshrn.s16 d21, q13, #2
+ vqrshrn.s16 d22, q14, #2
+ vmov.u8 q0, #(CENTERJSAMPLE)
+ vqrshrn.s16 d23, q15, #2
+ vtrn.8 d16, d17
+ vtrn.8 d18, d19
+ vadd.u8 q8, q8, q0
+ vadd.u8 q9, q9, q0
+ vtrn.16 q10, q11
+ /* Store results to the output buffer */
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d16}, [TMP1]
+ vtrn.8 d20, d21
+ vst1.8 {d17}, [TMP2]
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d18}, [TMP1]
+ vadd.u8 q10, q10, q0
+ vst1.8 {d19}, [TMP2]
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ add TMP3, TMP3, OUTPUT_COL
+ add TMP4, TMP4, OUTPUT_COL
+ vtrn.8 d22, d23
+ vst1.8 {d20}, [TMP1]
+ vadd.u8 q11, q11, q0
+ vst1.8 {d21}, [TMP2]
+ vst1.8 {d22}, [TMP3]
+ vst1.8 {d23}, [TMP4]
+ bx lr
+
+3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+
+ /* Transpose left 4x8 half */
+ vtrn.16 ROW6L, ROW7L
+ vtrn.16 ROW2L, ROW3L
+ vtrn.16 ROW0L, ROW1L
+ vtrn.16 ROW4L, ROW5L
+ vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
+ vtrn.32 ROW1L, ROW3L
+ vtrn.32 ROW4L, ROW6L
+ vtrn.32 ROW0L, ROW2L
+ vtrn.32 ROW5L, ROW7L
+
+ cmp r0, #0
+ beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second
+ pass */
+
+ /* Only row 0 is non-zero for the right 4x8 half */
+ vdup.s16 ROW1R, ROW0R[1]
+ vdup.s16 ROW2R, ROW0R[2]
+ vdup.s16 ROW3R, ROW0R[3]
+ vdup.s16 ROW4R, ROW0R[0]
+ vdup.s16 ROW5R, ROW0R[1]
+ vdup.s16 ROW6R, ROW0R[2]
+ vdup.s16 ROW7R, ROW0R[3]
+ vdup.s16 ROW0R, ROW0R[0]
+ b 1b /* Go to 'normal' second pass */
+
+4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW1L, XFIX_1_175875602
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW3L, XFIX_1_175875602
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
+ vshll.s16 q3, ROW0L, #13
+ vmov q4, q6
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+ vadd.s32 q1, q1, q6
+ vadd.s32 q6, q6, q6
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ vshrn.s32 ROW1L, q1, #16
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vshll.s16 q5, ROW0L, #13
+ vshrn.s32 ROW2L, q1, #16
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW3L, q5, #16
+ vshrn.s32 ROW0L, q6, #16
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
+ /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW5L, XFIX_1_175875602
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW7L, XFIX_1_175875602
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+ vmull.s16 q2, ROW6L, XFIX_0_541196100
+ vshll.s16 q3, ROW4L, #13
+ vmov q4, q6
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+ vadd.s32 q1, q1, q6
+ vadd.s32 q6, q6, q6
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW6R, q1, #16
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vshll.s16 q5, ROW4L, #13
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
+ vshrn.s32 ROW5R, q3, #16
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW7R, q2, #16
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW4R, q3, #16
+ b 2b /* Go to epilogue */
+
+ .unreq DCT_TABLE
+ .unreq COEF_BLOCK
+ .unreq OUTPUT_BUF
+ .unreq OUTPUT_COL
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
+ .unreq TMP4
+
+ .unreq ROW0L
+ .unreq ROW0R
+ .unreq ROW1L
+ .unreq ROW1R
+ .unreq ROW2L
+ .unreq ROW2R
+ .unreq ROW3L
+ .unreq ROW3R
+ .unreq ROW4L
+ .unreq ROW4R
+ .unreq ROW5L
+ .unreq ROW5R
+ .unreq ROW6L
+ .unreq ROW6R
+ .unreq ROW7L
+ .unreq ROW7R
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_ifast_neon
+ *
+ * This function contains a fast, not-so-accurate integer implementation of
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
+ * function from jidctfst.c.
+ *
+ * Normally the 1-D AAN DCT needs 5 multiplications and 29 additions.
+ * In the Arm Neon case, however, some extra additions are required because
+ * the VQDMULH instruction can't handle constants larger than 1. Expressions
+ * like "x * 1.082392200" therefore have to be converted to
+ * "x * 0.082392200 + x", which introduces an extra addition. Overall, there
+ * are 6 extra additions per 1-D IDCT pass, for a total of 5 VQDMULH and
+ * 35 VADD/VSUB instructions.
+ */
+
+#define XFIX_1_082392200 d0[0]
+#define XFIX_1_414213562 d0[1]
+#define XFIX_1_847759065 d0[2]
+#define XFIX_2_613125930 d0[3]
+
+.balign 16
+jsimd_idct_ifast_neon_consts:
+ .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
+ .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
+ .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
+ .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
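+
+/* Worked example of the fixed-point encoding above: VQDMULH.S16 computes
+ * (2 * x * k) >> 16, i.e. it multiplies by k / 32768.  The constant
+ * 1.082392200 is approximated as 277/256 = 1.08203125, and only its
+ * fractional part is stored:
+ *   XFIX_1_082392200 = 277 * 128 - 256 * 128 = 2688 = 0.08203125 * 32768
+ * so "x * 1.082392200" becomes VQDMULH(x, XFIX_1_082392200) + x, which is
+ * the "x * 0.082392200 + x" form described above.  Likewise,
+ * XFIX_2_613125930 stores only 669/256 - 2, and the two missing whole
+ * multiples of x are added back with VADD instructions.
+ */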
+
+asm_function jsimd_idct_ifast_neon
+
+ DCT_TABLE .req r0
+ COEF_BLOCK .req r1
+ OUTPUT_BUF .req r2
+ OUTPUT_COL .req r3
+ TMP1 .req r0
+ TMP2 .req r1
+ TMP3 .req r2
+ TMP4 .req ip
+
+ /* Load and dequantize coefficients into Neon registers
+ * with the following allocation:
+ * 0 1 2 3 | 4 5 6 7
+ * ---------+--------
+ * 0 | d16 | d17 ( q8 )
+ * 1 | d18 | d19 ( q9 )
+ * 2 | d20 | d21 ( q10 )
+ * 3 | d22 | d23 ( q11 )
+ * 4 | d24 | d25 ( q12 )
+ * 5 | d26 | d27 ( q13 )
+ * 6 | d28 | d29 ( q14 )
+ * 7 | d30 | d31 ( q15 )
+ */
+ adr ip, jsimd_idct_ifast_neon_consts
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+ vmul.s16 q8, q8, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q9, q9, q1
+ vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+ vmul.s16 q10, q10, q2
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vmul.s16 q11, q11, q3
+ vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+ vmul.s16 q12, q12, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q14, q14, q2
+ vmul.s16 q13, q13, q1
+ vld1.16 {d0}, [ip, :64] /* load constants */
+ vmul.s16 q15, q15, q3
+ vpush {d8 - d13} /* save Neon registers */
+ /* 1-D IDCT, pass 1 */
+ vsub.s16 q2, q10, q14
+ vadd.s16 q14, q10, q14
+ vsub.s16 q1, q11, q13
+ vadd.s16 q13, q11, q13
+ vsub.s16 q5, q9, q15
+ vadd.s16 q15, q9, q15
+ vqdmulh.s16 q4, q2, XFIX_1_414213562
+ vqdmulh.s16 q6, q1, XFIX_2_613125930
+ vadd.s16 q3, q1, q1
+ vsub.s16 q1, q5, q1
+ vadd.s16 q10, q2, q4
+ vqdmulh.s16 q4, q1, XFIX_1_847759065
+ vsub.s16 q2, q15, q13
+ vadd.s16 q3, q3, q6
+ vqdmulh.s16 q6, q2, XFIX_1_414213562
+ vadd.s16 q1, q1, q4
+ vqdmulh.s16 q4, q5, XFIX_1_082392200
+ vsub.s16 q10, q10, q14
+ vadd.s16 q2, q2, q6
+ vsub.s16 q6, q8, q12
+ vadd.s16 q12, q8, q12
+ vadd.s16 q9, q5, q4
+ vadd.s16 q5, q6, q10
+ vsub.s16 q10, q6, q10
+ vadd.s16 q6, q15, q13
+ vadd.s16 q8, q12, q14
+ vsub.s16 q3, q6, q3
+ vsub.s16 q12, q12, q14
+ vsub.s16 q3, q3, q1
+ vsub.s16 q1, q9, q1
+ vadd.s16 q2, q3, q2
+ vsub.s16 q15, q8, q6
+ vadd.s16 q1, q1, q2
+ vadd.s16 q8, q8, q6
+ vadd.s16 q14, q5, q3
+ vsub.s16 q9, q5, q3
+ vsub.s16 q13, q10, q2
+ vadd.s16 q10, q10, q2
+ /* Transpose */
+ vtrn.16 q8, q9
+ vsub.s16 q11, q12, q1
+ vtrn.16 q14, q15
+ vadd.s16 q12, q12, q1
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q8, q10
+ vtrn.32 q13, q15
+ vswp d28, d21
+ vswp d26, d19
+ /* 1-D IDCT, pass 2 */
+ vsub.s16 q2, q10, q14
+ vswp d30, d23
+ vadd.s16 q14, q10, q14
+ vswp d24, d17
+ vsub.s16 q1, q11, q13
+ vadd.s16 q13, q11, q13
+ vsub.s16 q5, q9, q15
+ vadd.s16 q15, q9, q15
+ vqdmulh.s16 q4, q2, XFIX_1_414213562
+ vqdmulh.s16 q6, q1, XFIX_2_613125930
+ vadd.s16 q3, q1, q1
+ vsub.s16 q1, q5, q1
+ vadd.s16 q10, q2, q4
+ vqdmulh.s16 q4, q1, XFIX_1_847759065
+ vsub.s16 q2, q15, q13
+ vadd.s16 q3, q3, q6
+ vqdmulh.s16 q6, q2, XFIX_1_414213562
+ vadd.s16 q1, q1, q4
+ vqdmulh.s16 q4, q5, XFIX_1_082392200
+ vsub.s16 q10, q10, q14
+ vadd.s16 q2, q2, q6
+ vsub.s16 q6, q8, q12
+ vadd.s16 q12, q8, q12
+ vadd.s16 q9, q5, q4
+ vadd.s16 q5, q6, q10
+ vsub.s16 q10, q6, q10
+ vadd.s16 q6, q15, q13
+ vadd.s16 q8, q12, q14
+ vsub.s16 q3, q6, q3
+ vsub.s16 q12, q12, q14
+ vsub.s16 q3, q3, q1
+ vsub.s16 q1, q9, q1
+ vadd.s16 q2, q3, q2
+ vsub.s16 q15, q8, q6
+ vadd.s16 q1, q1, q2
+ vadd.s16 q8, q8, q6
+ vadd.s16 q14, q5, q3
+ vsub.s16 q9, q5, q3
+ vsub.s16 q13, q10, q2
+ vpop {d8 - d13} /* restore Neon registers */
+ vadd.s16 q10, q10, q2
+ vsub.s16 q11, q12, q1
+ vadd.s16 q12, q12, q1
+ /* Descale to 8-bit and range limit */
+ vmov.u8 q0, #0x80
+ vqshrn.s16 d16, q8, #5
+ vqshrn.s16 d17, q9, #5
+ vqshrn.s16 d18, q10, #5
+ vqshrn.s16 d19, q11, #5
+ vqshrn.s16 d20, q12, #5
+ vqshrn.s16 d21, q13, #5
+ vqshrn.s16 d22, q14, #5
+ vqshrn.s16 d23, q15, #5
+ vadd.u8 q8, q8, q0
+ vadd.u8 q9, q9, q0
+ vadd.u8 q10, q10, q0
+ vadd.u8 q11, q11, q0
+ /* Transpose the final 8-bit samples */
+ vtrn.16 q8, q9
+ vtrn.16 q10, q11
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.8 d16, d17
+ vtrn.8 d18, d19
+ /* Store results to the output buffer */
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d16}, [TMP1]
+ vst1.8 {d17}, [TMP2]
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d18}, [TMP1]
+ vtrn.8 d20, d21
+ vst1.8 {d19}, [TMP2]
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ add TMP3, TMP3, OUTPUT_COL
+ add TMP4, TMP4, OUTPUT_COL
+ vst1.8 {d20}, [TMP1]
+ vtrn.8 d22, d23
+ vst1.8 {d21}, [TMP2]
+ vst1.8 {d22}, [TMP3]
+ vst1.8 {d23}, [TMP4]
+ bx lr
+
+ .unreq DCT_TABLE
+ .unreq COEF_BLOCK
+ .unreq OUTPUT_BUF
+ .unreq OUTPUT_COL
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
+ .unreq TMP4
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_extrgb_ycc_convert_neon
+ * jsimd_extbgr_ycc_convert_neon
+ * jsimd_extrgbx_ycc_convert_neon
+ * jsimd_extbgrx_ycc_convert_neon
+ * jsimd_extxbgr_ycc_convert_neon
+ * jsimd_extxrgb_ycc_convert_neon
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro do_store size
+ .if \size == 8
+ vst1.8 {d20}, [Y]!
+ vst1.8 {d21}, [U]!
+ vst1.8 {d22}, [V]!
+ .elseif \size == 4
+ vst1.8 {d20[0]}, [Y]!
+ vst1.8 {d20[1]}, [Y]!
+ vst1.8 {d20[2]}, [Y]!
+ vst1.8 {d20[3]}, [Y]!
+ vst1.8 {d21[0]}, [U]!
+ vst1.8 {d21[1]}, [U]!
+ vst1.8 {d21[2]}, [U]!
+ vst1.8 {d21[3]}, [U]!
+ vst1.8 {d22[0]}, [V]!
+ vst1.8 {d22[1]}, [V]!
+ vst1.8 {d22[2]}, [V]!
+ vst1.8 {d22[3]}, [V]!
+ .elseif \size == 2
+ vst1.8 {d20[4]}, [Y]!
+ vst1.8 {d20[5]}, [Y]!
+ vst1.8 {d21[4]}, [U]!
+ vst1.8 {d21[5]}, [U]!
+ vst1.8 {d22[4]}, [V]!
+ vst1.8 {d22[5]}, [V]!
+ .elseif \size == 1
+ vst1.8 {d20[6]}, [Y]!
+ vst1.8 {d21[6]}, [U]!
+ vst1.8 {d22[6]}, [V]!
+ .else
+ .error unsupported macroblock size
+ .endif
+.endm
+
+.macro do_load bpp, size
+ .if \bpp == 24
+ .if \size == 8
+ vld3.8 {d10, d11, d12}, [RGB]!
+ pld [RGB, #128]
+ .elseif \size == 4
+ vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
+ vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
+ vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
+ vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
+ .elseif \size == 2
+ vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
+ vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
+ .elseif \size == 1
+ vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
+ .else
+ .error unsupported macroblock size
+ .endif
+ .elseif \bpp == 32
+ .if \size == 8
+ vld4.8 {d10, d11, d12, d13}, [RGB]!
+ pld [RGB, #128]
+ .elseif \size == 4
+ vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
+ vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+ vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+ vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+ .elseif \size == 2
+ vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+ vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+ .elseif \size == 1
+ vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+ .else
+ .error unsupported macroblock size
+ .endif
+ .else
+ .error unsupported bpp
+ .endif
+.endm
+
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
+
+/*
+ * 2-stage pipelined RGB->YCbCr conversion
+ */
+
+.macro do_rgb_to_yuv_stage1
+ vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
+ vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
+ vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
+ vmull.u16 q7, d4, d0[0]
+ vmlal.u16 q7, d6, d0[1]
+ vmlal.u16 q7, d8, d0[2]
+ vmull.u16 q8, d5, d0[0]
+ vmlal.u16 q8, d7, d0[1]
+ vmlal.u16 q8, d9, d0[2]
+ vrev64.32 q9, q1
+ vrev64.32 q13, q1
+ vmlsl.u16 q9, d4, d0[3]
+ vmlsl.u16 q9, d6, d1[0]
+ vmlal.u16 q9, d8, d1[1]
+ vmlsl.u16 q13, d5, d0[3]
+ vmlsl.u16 q13, d7, d1[0]
+ vmlal.u16 q13, d9, d1[1]
+ vrev64.32 q14, q1
+ vrev64.32 q15, q1
+ vmlal.u16 q14, d4, d1[1]
+ vmlsl.u16 q14, d6, d1[2]
+ vmlsl.u16 q14, d8, d1[3]
+ vmlal.u16 q15, d5, d1[1]
+ vmlsl.u16 q15, d7, d1[2]
+ vmlsl.u16 q15, d9, d1[3]
+.endm
+
+.macro do_rgb_to_yuv_stage2
+ vrshrn.u32 d20, q7, #16
+ vrshrn.u32 d21, q8, #16
+ vshrn.u32 d22, q9, #16
+ vshrn.u32 d23, q13, #16
+ vshrn.u32 d24, q14, #16
+ vshrn.u32 d25, q15, #16
+ vmovn.u16 d20, q10 /* d20 = y */
+ vmovn.u16 d21, q11 /* d21 = u */
+ vmovn.u16 d22, q12 /* d22 = v */
+.endm
+
+.macro do_rgb_to_yuv
+ do_rgb_to_yuv_stage1
+ do_rgb_to_yuv_stage2
+.endm
+
+.macro do_rgb_to_yuv_stage2_store_load_stage1
+ vrshrn.u32 d20, q7, #16
+ vrshrn.u32 d21, q8, #16
+ vshrn.u32 d22, q9, #16
+ vrev64.32 q9, q1
+ vshrn.u32 d23, q13, #16
+ vrev64.32 q13, q1
+ vshrn.u32 d24, q14, #16
+ vshrn.u32 d25, q15, #16
+ do_load \bpp, 8
+ vmovn.u16 d20, q10 /* d20 = y */
+ vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
+ vmovn.u16 d21, q11 /* d21 = u */
+ vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
+ vmovn.u16 d22, q12 /* d22 = v */
+ vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
+ vmull.u16 q7, d4, d0[0]
+ vmlal.u16 q7, d6, d0[1]
+ vmlal.u16 q7, d8, d0[2]
+ vst1.8 {d20}, [Y]!
+ vmull.u16 q8, d5, d0[0]
+ vmlal.u16 q8, d7, d0[1]
+ vmlal.u16 q8, d9, d0[2]
+ vmlsl.u16 q9, d4, d0[3]
+ vmlsl.u16 q9, d6, d1[0]
+ vmlal.u16 q9, d8, d1[1]
+ vst1.8 {d21}, [U]!
+ vmlsl.u16 q13, d5, d0[3]
+ vmlsl.u16 q13, d7, d1[0]
+ vmlal.u16 q13, d9, d1[1]
+ vrev64.32 q14, q1
+ vrev64.32 q15, q1
+ vmlal.u16 q14, d4, d1[1]
+ vmlsl.u16 q14, d6, d1[2]
+ vmlsl.u16 q14, d8, d1[3]
+ vst1.8 {d22}, [V]!
+ vmlal.u16 q15, d5, d1[1]
+ vmlsl.u16 q15, d7, d1[2]
+ vmlsl.u16 q15, d9, d1[3]
+.endm
+
+.balign 16
+jsimd_\colorid\()_ycc_neon_consts:
+ .short 19595, 38470, 7471, 11059
+ .short 21709, 32768, 27439, 5329
+ .short 32767, 128, 32767, 128
+ .short 32767, 128, 32767, 128
+
+asm_function jsimd_\colorid\()_ycc_convert_neon
+ OUTPUT_WIDTH .req r0
+ INPUT_BUF .req r1
+ OUTPUT_BUF .req r2
+ OUTPUT_ROW .req r3
+ NUM_ROWS .req r4
+
+ OUTPUT_BUF0 .req r5
+ OUTPUT_BUF1 .req r6
+ OUTPUT_BUF2 .req OUTPUT_BUF
+
+ RGB .req r7
+ Y .req r8
+ U .req r9
+ V .req r10
+ N .req ip
+
+ /* Load constants to d0, d1, d2, d3 */
+ adr ip, jsimd_\colorid\()_ycc_neon_consts
+ vld1.16 {d0, d1, d2, d3}, [ip, :128]
+
+ /* Save Arm registers and handle input arguments */
+ push {r4, r5, r6, r7, r8, r9, r10, lr}
+ ldr NUM_ROWS, [sp, #(4 * 8)]
+ ldr OUTPUT_BUF0, [OUTPUT_BUF]
+ ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
+ ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
+ .unreq OUTPUT_BUF
+
+ /* Save Neon registers */
+ vpush {d8 - d15}
+
+ /* Outer loop over scanlines */
+ cmp NUM_ROWS, #1
+ blt 9f
+0:
+ ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
+ ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
+ mov N, OUTPUT_WIDTH
+ ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
+ add OUTPUT_ROW, OUTPUT_ROW, #1
+ ldr RGB, [INPUT_BUF], #4
+
+ /* Inner loop over pixels */
+ subs N, N, #8
+ blt 3f
+ do_load \bpp, 8
+ do_rgb_to_yuv_stage1
+ subs N, N, #8
+ blt 2f
+1:
+ do_rgb_to_yuv_stage2_store_load_stage1
+ subs N, N, #8
+ bge 1b
+2:
+ do_rgb_to_yuv_stage2
+ do_store 8
+ tst N, #7
+ beq 8f
+3:
+ tst N, #4
+ beq 3f
+ do_load \bpp, 4
+3:
+ tst N, #2
+ beq 4f
+ do_load \bpp, 2
+4:
+ tst N, #1
+ beq 5f
+ do_load \bpp, 1
+5:
+ do_rgb_to_yuv
+ tst N, #4
+ beq 6f
+ do_store 4
+6:
+ tst N, #2
+ beq 7f
+ do_store 2
+7:
+ tst N, #1
+ beq 8f
+ do_store 1
+8:
+ subs NUM_ROWS, NUM_ROWS, #1
+ bgt 0b
+9:
+ /* Restore all registers and return */
+ vpop {d8 - d15}
+ pop {r4, r5, r6, r7, r8, r9, r10, pc}
+
+ .unreq OUTPUT_WIDTH
+ .unreq OUTPUT_ROW
+ .unreq INPUT_BUF
+ .unreq NUM_ROWS
+ .unreq OUTPUT_BUF0
+ .unreq OUTPUT_BUF1
+ .unreq OUTPUT_BUF2
+ .unreq RGB
+ .unreq Y
+ .unreq U
+ .unreq V
+ .unreq N
+
+.purgem do_rgb_to_yuv
+.purgem do_rgb_to_yuv_stage1
+.purgem do_rgb_to_yuv_stage2
+.purgem do_rgb_to_yuv_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R G B */
+generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
+
+.purgem do_load
+.purgem do_store
diff --git a/media/libjpeg/simd/arm/aarch64/jccolext-neon.c b/media/libjpeg/simd/arm/aarch64/jccolext-neon.c
new file mode 100644
index 0000000000..37130c225e
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jccolext-neon.c
@@ -0,0 +1,316 @@
+/*
+ * jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * 0.16874695 = 11059 * 2^-16
+ * 0.33125305 = 21709 * 2^-16
+ * 0.50000000 = 32768 * 2^-16
+ * 0.41868592 = 27439 * 2^-16
+ * 0.08131409 = 5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, so that the
+ * subsequent truncating right shift effectively rounds the result to the
+ * nearest integer.
+ */
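+
+/* Concretely, scaled_128_5 below packs (128 << 16) + 32767 into each Cb/Cr
+ * accumulator lane: 128 is the chroma bias and 32767 is roughly 0.5 in 2^-16
+ * fixed point, so the plain (truncating) right shift by 16 used for Cb and Cr
+ * behaves almost exactly like the rounding right shift used for Y.
+ */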
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ /* Pointer to RGB(X/A) input data */
+ JSAMPROW inptr;
+ /* Pointers to Y, Cb, and Cr output data */
+ JSAMPROW outptr0, outptr1, outptr2;
+ /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+ ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+ /* Set up conversion constants. */
+ const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
+ const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ int cols_remaining = image_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+ uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+ uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+ uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_ll = scaled_128_5;
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+ cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+ uint32x4_t cb_lh = scaled_128_5;
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+ cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+ uint32x4_t cb_hl = scaled_128_5;
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+ cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+ uint32x4_t cb_hh = scaled_128_5;
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+ cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_ll = scaled_128_5;
+ cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+ uint32x4_t cr_lh = scaled_128_5;
+ cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+ uint32x4_t cr_hl = scaled_128_5;
+ cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+ uint32x4_t cr_hh = scaled_128_5;
+ cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+ vshrn_n_u32(cb_lh, 16));
+ uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+ vshrn_n_u32(cb_hh, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+ vshrn_n_u32(cr_lh, 16));
+ uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+ vshrn_n_u32(cr_hh, 16));
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+ vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+ vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+ /* Increment pointers. */
+ inptr += (16 * RGB_PIXELSIZE);
+ outptr0 += 16;
+ outptr1 += 16;
+ outptr2 += 16;
+ }
+
+ if (cols_remaining > 8) {
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 16) columns of data are first memcopied to a temporary
+ * buffer large enough to accommodate the vector load.
+ */
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+ uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+ uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+ uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_ll = scaled_128_5;
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+ cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+ uint32x4_t cb_lh = scaled_128_5;
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+ cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+ uint32x4_t cb_hl = scaled_128_5;
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+ cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+ uint32x4_t cb_hh = scaled_128_5;
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+ cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_ll = scaled_128_5;
+ cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+ uint32x4_t cr_lh = scaled_128_5;
+ cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+ uint32x4_t cr_hl = scaled_128_5;
+ cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+ uint32x4_t cr_hh = scaled_128_5;
+ cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+ vshrn_n_u32(cb_lh, 16));
+ uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+ vshrn_n_u32(cb_hh, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+ vshrn_n_u32(cr_lh, 16));
+ uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+ vshrn_n_u32(cr_hh, 16));
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+ vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+ vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+ } else if (cols_remaining > 0) {
+ /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 16) columns of data are first memcopied to a temporary
+ * buffer large enough to accommodate the vector load.
+ */
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+ uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+ uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+ uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+ uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
+ y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
+ y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
+ uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
+ y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
+ y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_l = scaled_128_5;
+ cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
+ cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
+ cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
+ uint32x4_t cb_h = scaled_128_5;
+ cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
+ cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
+ cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_l = scaled_128_5;
+ cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
+ cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
+ cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
+ uint32x4_t cr_h = scaled_128_5;
+ cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
+ cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
+ cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
+ vrshrn_n_u32(y_h, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
+ vshrn_n_u32(cb_h, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
+ vshrn_n_u32(cr_h, 16));
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1_u8(outptr0, vmovn_u16(y_u16));
+ vst1_u8(outptr1, vmovn_u16(cb_u16));
+ vst1_u8(outptr2, vmovn_u16(cr_u16));
+ }
+ }
+}
diff --git a/media/libjpeg/simd/arm/aarch64/jchuff-neon.c b/media/libjpeg/simd/arm/aarch64/jchuff-neon.c
new file mode 100644
index 0000000000..607a116070
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jchuff-neon.c
@@ -0,0 +1,411 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, 2022, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../align.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
+ 0, 1, 2, 3, 16, 17, 32, 33,
+ 18, 19, 4, 5, 6, 7, 20, 21,
+ 34, 35, 48, 49, 255, 255, 50, 51,
+ 36, 37, 22, 23, 8, 9, 10, 11,
+ 255, 255, 6, 7, 20, 21, 34, 35,
+ 48, 49, 255, 255, 50, 51, 36, 37,
+ 54, 55, 40, 41, 26, 27, 12, 13,
+ 14, 15, 28, 29, 42, 43, 56, 57,
+ 6, 7, 20, 21, 34, 35, 48, 49,
+ 50, 51, 36, 37, 22, 23, 8, 9,
+ 26, 27, 12, 13, 255, 255, 14, 15,
+ 28, 29, 42, 43, 56, 57, 255, 255,
+ 52, 53, 54, 55, 40, 41, 26, 27,
+ 12, 13, 255, 255, 14, 15, 28, 29,
+ 26, 27, 40, 41, 42, 43, 28, 29,
+ 14, 15, 30, 31, 44, 45, 46, 47
+};
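+
+/* Each entry in the table above is a byte offset into the four 16-byte row
+ * groups loaded below, arranged so that TBL lookups reorder the coefficients
+ * into zig-zag order.  Entries of 255 are deliberately out of range: TBL
+ * returns 0 for those lanes, and they are filled in afterwards with
+ * vsetq_lane_s16() (see "Initialize AC coefficient lanes not reachable by
+ * lookup tables" below).
+ */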
+
+/* The AArch64 implementation of the FLUSH() macro triggers a UBSan misaligned
+ * address warning because the macro sometimes writes a 64-bit value to a
+ * non-64-bit-aligned address. That behavior is technically undefined per
+ * the C specification, but it is supported by the AArch64 architecture and
+ * compilers.
+ */
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("alignment")))
+#endif
+#endif
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+ JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ uint16_t block_diff[DCTSIZE2];
+
+ /* Load lookup table indices for rows of zig-zag ordering. */
+#ifdef HAVE_VLD1Q_U8_X4
+ const uint8x16x4_t idx_rows_0123 =
+ vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
+ const uint8x16x4_t idx_rows_4567 =
+ vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE);
+#else
+  /* GCC does not currently support the vld1q_<type>_x4() intrinsics. */
+ const uint8x16x4_t idx_rows_0123 = { {
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 2 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 4 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 6 * DCTSIZE)
+ } };
+ const uint8x16x4_t idx_rows_4567 = { {
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 10 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 12 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 14 * DCTSIZE)
+ } };
+#endif
+
+ /* Load 8x8 block of DCT coefficients. */
+#ifdef HAVE_VLD1Q_U8_X4
+ const int8x16x4_t tbl_rows_0123 =
+ vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
+ const int8x16x4_t tbl_rows_4567 =
+ vld1q_s8_x4((int8_t *)(block + 4 * DCTSIZE));
+#else
+ const int8x16x4_t tbl_rows_0123 = { {
+ vld1q_s8((int8_t *)(block + 0 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 1 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 2 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 3 * DCTSIZE))
+ } };
+ const int8x16x4_t tbl_rows_4567 = { {
+ vld1q_s8((int8_t *)(block + 4 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 5 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 6 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 7 * DCTSIZE))
+ } };
+#endif
+
+ /* Initialise extra lookup tables. */
+ const int8x16x4_t tbl_rows_2345 = { {
+ tbl_rows_0123.val[2], tbl_rows_0123.val[3],
+ tbl_rows_4567.val[0], tbl_rows_4567.val[1]
+ } };
+ const int8x16x3_t tbl_rows_567 =
+ { { tbl_rows_4567.val[1], tbl_rows_4567.val[2], tbl_rows_4567.val[3] } };
+
+ /* Shuffle coefficients into zig-zag order. */
+ int16x8_t row0 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[0]));
+ int16x8_t row1 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[1]));
+ int16x8_t row2 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_0123.val[2]));
+ int16x8_t row3 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[3]));
+ int16x8_t row4 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[0]));
+ int16x8_t row5 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_4567.val[1]));
+ int16x8_t row6 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[2]));
+ int16x8_t row7 =
+ vreinterpretq_s16_s8(vqtbl3q_s8(tbl_rows_567, idx_rows_4567.val[3]));
+
+ /* Compute DC coefficient difference value (F.1.1.5.1). */
+ row0 = vsetq_lane_s16(block[0] - last_dc_val, row0, 0);
+ /* Initialize AC coefficient lanes not reachable by lookup tables. */
+ row1 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[0]),
+ 0), row1, 2);
+ row2 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+ 4), row2, 0);
+ row2 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+ 0), row2, 5);
+ row5 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+ 7), row5, 2);
+ row5 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+ 3), row5, 7);
+ row6 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[3]),
+ 7), row6, 5);
+
+ /* DCT block is now in zig-zag order; start Huffman encoding process. */
+
+ /* Construct bitmap to accelerate encoding of AC coefficients. A set bit
+ * means that the corresponding coefficient != 0.
+ */
+ uint16x8_t row0_ne_0 = vtstq_s16(row0, row0);
+ uint16x8_t row1_ne_0 = vtstq_s16(row1, row1);
+ uint16x8_t row2_ne_0 = vtstq_s16(row2, row2);
+ uint16x8_t row3_ne_0 = vtstq_s16(row3, row3);
+ uint16x8_t row4_ne_0 = vtstq_s16(row4, row4);
+ uint16x8_t row5_ne_0 = vtstq_s16(row5, row5);
+ uint16x8_t row6_ne_0 = vtstq_s16(row6, row6);
+ uint16x8_t row7_ne_0 = vtstq_s16(row7, row7);
+
+ uint8x16_t row10_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row1_ne_0),
+ vreinterpretq_u8_u16(row0_ne_0));
+ uint8x16_t row32_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row3_ne_0),
+ vreinterpretq_u8_u16(row2_ne_0));
+ uint8x16_t row54_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row5_ne_0),
+ vreinterpretq_u8_u16(row4_ne_0));
+ uint8x16_t row76_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row7_ne_0),
+ vreinterpretq_u8_u16(row6_ne_0));
+
+ /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+ const uint8x16_t bitmap_mask =
+ vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080));
+
+ uint8x16_t bitmap_rows_10 = vandq_u8(row10_ne_0, bitmap_mask);
+ uint8x16_t bitmap_rows_32 = vandq_u8(row32_ne_0, bitmap_mask);
+ uint8x16_t bitmap_rows_54 = vandq_u8(row54_ne_0, bitmap_mask);
+ uint8x16_t bitmap_rows_76 = vandq_u8(row76_ne_0, bitmap_mask);
+
+ uint8x16_t bitmap_rows_3210 = vpaddq_u8(bitmap_rows_32, bitmap_rows_10);
+ uint8x16_t bitmap_rows_7654 = vpaddq_u8(bitmap_rows_76, bitmap_rows_54);
+ uint8x16_t bitmap_rows_76543210 = vpaddq_u8(bitmap_rows_7654,
+ bitmap_rows_3210);
+ uint8x8_t bitmap_all = vpadd_u8(vget_low_u8(bitmap_rows_76543210),
+ vget_high_u8(bitmap_rows_76543210));
+
+ /* Shift left to remove DC bit. */
+ bitmap_all =
+ vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap_all), 1));
+ /* Count bits set (number of non-zero coefficients) in bitmap. */
+ unsigned int non_zero_coefficients = vaddv_u8(vcnt_u8(bitmap_all));
+ /* Move bitmap to 64-bit scalar register. */
+ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
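+  /* After the shift above, bit 63 of bitmap corresponds to AC coefficient 1,
+   * bit 62 to coefficient 2, and so on down the zig-zag order (the DC bit has
+   * been shifted out).  This lets BUILTIN_CLZLL() in the loops below skip an
+   * entire run of zero coefficients in a single step.
+   */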
+
+ /* Set up state and bit buffer for output bitstream. */
+ working_state *state_ptr = (working_state *)state;
+ int free_bits = state_ptr->cur.free_bits;
+ size_t put_buffer = state_ptr->cur.put_buffer;
+
+ /* Encode DC coefficient. */
+
+  /* For negative coeffs: diff = coeff - 1 = ~abs(coeff) (in the low nbits) */
+ int16x8_t abs_row0 = vabsq_s16(row0);
+ int16x8_t row0_lz = vclzq_s16(abs_row0);
+ uint16x8_t row0_mask = vshlq_u16(vcltzq_s16(row0), vnegq_s16(row0_lz));
+ uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+ /* Find nbits required to specify sign and amplitude of coefficient. */
+ unsigned int lz = vgetq_lane_u16(vreinterpretq_u16_s16(row0_lz), 0);
+ unsigned int nbits = 16 - lz;
+ /* Emit Huffman-coded symbol and additional diff bits. */
+ unsigned int diff = vgetq_lane_u16(row0_diff, 0);
+ PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
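+  /* Worked example of the encoding above: for a DC difference of -3,
+   * abs_row0 lane 0 is 3, its 16-bit leading zero count is 14, so nbits = 2
+   * and row0_mask = 0xFFFF >> 14 = 0x0003; diff = 3 ^ 0x0003 = 0, i.e. the
+   * low 2 bits of (-3 - 1), as required for negative values.  For positive
+   * values the mask is 0 and diff is simply the magnitude.
+   */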
+
+ /* Encode AC coefficients. */
+
+ unsigned int r = 0; /* r = run length of zeros */
+ unsigned int i = 1; /* i = number of coefficients encoded */
+ /* Code and size information for a run length of 16 zero coefficients */
+ const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+ const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+ /* The most efficient method of computing nbits and diff depends on the
+ * number of non-zero coefficients. If the bitmap is not too sparse (> 8
+ * non-zero AC coefficients), it is beneficial to do all of the work using
+ * Neon; else we do some of the work using Neon and the rest on demand using
+ * scalar code.
+ */
+ if (non_zero_coefficients > 8) {
+ uint8_t block_nbits[DCTSIZE2];
+
+ int16x8_t abs_row1 = vabsq_s16(row1);
+ int16x8_t abs_row2 = vabsq_s16(row2);
+ int16x8_t abs_row3 = vabsq_s16(row3);
+ int16x8_t abs_row4 = vabsq_s16(row4);
+ int16x8_t abs_row5 = vabsq_s16(row5);
+ int16x8_t abs_row6 = vabsq_s16(row6);
+ int16x8_t abs_row7 = vabsq_s16(row7);
+ int16x8_t row1_lz = vclzq_s16(abs_row1);
+ int16x8_t row2_lz = vclzq_s16(abs_row2);
+ int16x8_t row3_lz = vclzq_s16(abs_row3);
+ int16x8_t row4_lz = vclzq_s16(abs_row4);
+ int16x8_t row5_lz = vclzq_s16(abs_row5);
+ int16x8_t row6_lz = vclzq_s16(abs_row6);
+ int16x8_t row7_lz = vclzq_s16(abs_row7);
+ /* Narrow leading zero count to 8 bits. */
+ uint8x16_t row01_lz = vuzp1q_u8(vreinterpretq_u8_s16(row0_lz),
+ vreinterpretq_u8_s16(row1_lz));
+ uint8x16_t row23_lz = vuzp1q_u8(vreinterpretq_u8_s16(row2_lz),
+ vreinterpretq_u8_s16(row3_lz));
+ uint8x16_t row45_lz = vuzp1q_u8(vreinterpretq_u8_s16(row4_lz),
+ vreinterpretq_u8_s16(row5_lz));
+ uint8x16_t row67_lz = vuzp1q_u8(vreinterpretq_u8_s16(row6_lz),
+ vreinterpretq_u8_s16(row7_lz));
+ /* Compute nbits needed to specify magnitude of each coefficient. */
+ uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16), row01_lz);
+ uint8x16_t row23_nbits = vsubq_u8(vdupq_n_u8(16), row23_lz);
+ uint8x16_t row45_nbits = vsubq_u8(vdupq_n_u8(16), row45_lz);
+ uint8x16_t row67_nbits = vsubq_u8(vdupq_n_u8(16), row67_lz);
+ /* Store nbits. */
+ vst1q_u8(block_nbits + 0 * DCTSIZE, row01_nbits);
+ vst1q_u8(block_nbits + 2 * DCTSIZE, row23_nbits);
+ vst1q_u8(block_nbits + 4 * DCTSIZE, row45_nbits);
+ vst1q_u8(block_nbits + 6 * DCTSIZE, row67_nbits);
+ /* Mask bits not required to specify sign and amplitude of diff. */
+ uint16x8_t row1_mask = vshlq_u16(vcltzq_s16(row1), vnegq_s16(row1_lz));
+ uint16x8_t row2_mask = vshlq_u16(vcltzq_s16(row2), vnegq_s16(row2_lz));
+ uint16x8_t row3_mask = vshlq_u16(vcltzq_s16(row3), vnegq_s16(row3_lz));
+ uint16x8_t row4_mask = vshlq_u16(vcltzq_s16(row4), vnegq_s16(row4_lz));
+ uint16x8_t row5_mask = vshlq_u16(vcltzq_s16(row5), vnegq_s16(row5_lz));
+ uint16x8_t row6_mask = vshlq_u16(vcltzq_s16(row6), vnegq_s16(row6_lz));
+ uint16x8_t row7_mask = vshlq_u16(vcltzq_s16(row7), vnegq_s16(row7_lz));
+ /* diff = abs(coeff) ^ sign(coeff) [no-op for positive coefficients] */
+ uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+ row1_mask);
+ uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+ row2_mask);
+ uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+ row3_mask);
+ uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+ row4_mask);
+ uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+ row5_mask);
+ uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+ row6_mask);
+ uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+ row7_mask);
+ /* Store diff bits. */
+ vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+ vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+ vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+ vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+ vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+ vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+ vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+ vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+ while (bitmap != 0) {
+ r = BUILTIN_CLZLL(bitmap);
+ i += r;
+ bitmap <<= r;
+ nbits = block_nbits[i];
+ diff = block_diff[i];
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ i++;
+ bitmap <<= 1;
+ }
+ } else if (bitmap != 0) {
+ uint16_t block_abs[DCTSIZE2];
+ /* Compute and store absolute value of coefficients. */
+ int16x8_t abs_row1 = vabsq_s16(row1);
+ int16x8_t abs_row2 = vabsq_s16(row2);
+ int16x8_t abs_row3 = vabsq_s16(row3);
+ int16x8_t abs_row4 = vabsq_s16(row4);
+ int16x8_t abs_row5 = vabsq_s16(row5);
+ int16x8_t abs_row6 = vabsq_s16(row6);
+ int16x8_t abs_row7 = vabsq_s16(row7);
+ vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
+ vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
+ vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
+ vst1q_u16(block_abs + 3 * DCTSIZE, vreinterpretq_u16_s16(abs_row3));
+ vst1q_u16(block_abs + 4 * DCTSIZE, vreinterpretq_u16_s16(abs_row4));
+ vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
+ vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
+ vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
+ /* Compute diff bits (without nbits mask) and store. */
+ uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+ vcltzq_s16(row1));
+ uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+ vcltzq_s16(row2));
+ uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+ vcltzq_s16(row3));
+ uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+ vcltzq_s16(row4));
+ uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+ vcltzq_s16(row5));
+ uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+ vcltzq_s16(row6));
+ uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+ vcltzq_s16(row7));
+ vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+ vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+ vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+ vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+ vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+ vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+ vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+ vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+ /* Same as above but must mask diff bits and compute nbits on demand. */
+ while (bitmap != 0) {
+ r = BUILTIN_CLZLL(bitmap);
+ i += r;
+ bitmap <<= r;
+ lz = BUILTIN_CLZ(block_abs[i]);
+ nbits = 32 - lz;
+ diff = ((unsigned int)block_diff[i] << lz) >> lz;
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ i++;
+ bitmap <<= 1;
+ }
+ }
+
+ /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+ * The value of RS for the EOB code is 0.
+ */
+ if (i != 64) {
+ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+ }
+
+ state_ptr->cur.put_buffer = put_buffer;
+ state_ptr->cur.free_bits = free_bits;
+
+ return buffer;
+}
diff --git a/media/libjpeg/simd/arm/aarch64/jsimd.c b/media/libjpeg/simd/arm/aarch64/jsimd.c
new file mode 100644
index 0000000000..41c06d3180
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jsimd.c
@@ -0,0 +1,1056 @@
+/*
+ * jsimd_arm64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit Arm architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "jconfigint.h"
+
+#include <ctype.h>
+
+#define JSIMD_FASTLD3 1
+#define JSIMD_FASTST3 2
+#define JSIMD_FASTTBL 4
+
+static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
+ JSIMD_FASTTBL;
+
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+LOCAL(int)
+check_cpuinfo(char *buffer, const char *field, char *value)
+{
+ char *p;
+
+ if (*value == 0)
+ return 0;
+ if (strncmp(buffer, field, strlen(field)) != 0)
+ return 0;
+ buffer += strlen(field);
+ while (isspace(*buffer))
+ buffer++;
+
+ /* Check if 'value' is present in the buffer as a separate word */
+ while ((p = strstr(buffer, value))) {
+ if (p > buffer && !isspace(*(p - 1))) {
+ buffer++;
+ continue;
+ }
+ p += strlen(value);
+ if (*p != 0 && !isspace(*p)) {
+ buffer++;
+ continue;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+ char *buffer = (char *)malloc(bufsize);
+ FILE *fd;
+
+ if (!buffer)
+ return 0;
+
+ fd = fopen("/proc/cpuinfo", "r");
+ if (fd) {
+ while (fgets(buffer, bufsize, fd)) {
+ if (!strchr(buffer, '\n') && !feof(fd)) {
+ /* "impossible" happened - insufficient size of the buffer! */
+ fclose(fd);
+ free(buffer);
+ return 0;
+ }
+ if (check_cpuinfo(buffer, "CPU part", "0xd03") ||
+ check_cpuinfo(buffer, "CPU part", "0xd07"))
+ /* The Cortex-A53 has a slow tbl implementation. We can gain a few
+ percent speedup by disabling the use of that instruction. The
+ speedup on Cortex-A57 is more subtle but still measurable. */
+ simd_features &= ~JSIMD_FASTTBL;
+ else if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
+ /* The SIMD version of Huffman encoding is slower than the C version on
+           Cavium ThunderX.  Also, ld3 and st3 are abysmally slow on that
+ CPU. */
+ simd_huffman = simd_features = 0;
+ }
+ fclose(fd);
+ }
+ free(buffer);
+ return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy in a multi-threaded environment.
+ */
+
+/*
+ * Armv8 architectures support the Neon extensions by default;
+ * unlike on Armv7, they are not optional.
+ */
+
+
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char env[2] = { 0 };
+#endif
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+ int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = 0;
+
+ simd_support |= JSIMD_NEON;
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+ while (!parse_proc_cpuinfo(bufsize)) {
+ bufsize *= 2;
+ if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+ break;
+ }
+#endif
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
+ simd_support = JSIMD_NEON;
+ if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+ simd_support = 0;
+ if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+ simd_huffman = 0;
+ if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "1"))
+ simd_features |= JSIMD_FASTLD3;
+ if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "0"))
+ simd_features &= ~JSIMD_FASTLD3;
+ if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "1"))
+ simd_features |= JSIMD_FASTST3;
+ if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "0"))
+ simd_features &= ~JSIMD_FASTST3;
+#endif
+}
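+
+/* For example, the environment variables above can be used to override the
+ * detected settings at run time: JSIMD_FORCENONE=1 disables all of the Neon
+ * code paths, and JSIMD_FASTLD3=0 / JSIMD_FASTST3=0 disable the fast ld3/st3
+ * paths on CPUs where those instructions are known to be slow.
+ */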
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTLD3)
+#endif
+ neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_extrgbx_ycc_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTLD3)
+#endif
+ neonfct = jsimd_extbgr_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_extbgr_ycc_convert_neon_slowld3;
+#endif
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_extbgrx_ycc_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_extxbgr_ycc_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_extxrgb_ycc_convert_neon;
+ break;
+ default:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTLD3)
+#endif
+ neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
+ break;
+ }
+
+ neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
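+
+/* Note: the "slowld3" variants above are only referenced when
+ * NEON_INTRINSICS is not defined (the GAS build).  They are selected at run
+ * time whenever the JSIMD_FASTLD3 feature bit is clear -- for example after
+ * parse_proc_cpuinfo() zeroes simd_features on Cavium ThunderX, or when
+ * JSIMD_FASTLD3=0 is set in the environment. */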
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_extrgb_gray_convert_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_extrgbx_gray_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_extbgr_gray_convert_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_extbgrx_gray_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_extxbgr_gray_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_extxrgb_gray_convert_neon;
+ break;
+ default:
+ neonfct = jsimd_extrgb_gray_convert_neon;
+ break;
+ }
+
+ neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTST3)
+#endif
+ neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_ycc_extrgbx_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTST3)
+#endif
+ neonfct = jsimd_ycc_extbgr_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_ycc_extbgr_convert_neon_slowst3;
+#endif
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_ycc_extbgrx_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_ycc_extxbgr_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_ycc_extxrgb_convert_neon;
+ break;
+ default:
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTST3)
+#endif
+ neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+ output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+ break;
+ default:
+ neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+ break;
+ default:
+ neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if (sizeof(IFAST_MULT_TYPE) != 2)
+ return 0;
+ if (IFAST_SCALE_BITS != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON && simd_huffman)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTTBL)
+#endif
+ return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+ dctbl, actbl);
+#ifndef NEON_INTRINSICS
+ else
+ return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
+ last_dc_val, dctbl, actbl);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
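+  /* zerobits is a bitmap of per-coefficient flags; the Neon implementation
+     presumably packs all 64 of them into a single 64-bit word, hence the
+     size_t width requirement below. */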
+ if (SIZEOF_SIZE_T != 8)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *values, size_t *zerobits)
+{
+ jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (SIZEOF_SIZE_T != 8)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, JCOEF *absvalues, size_t *bits)
+{
+ return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+ jpeg_natural_order_start,
+ Sl, Al, absvalues, bits);
+}
diff --git a/media/libjpeg/simd/arm/aarch64/jsimd_neon.S b/media/libjpeg/simd/arm/aarch64/jsimd_neon.S
new file mode 100644
index 0000000000..738a4f0658
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jsimd_neon.S
@@ -0,0 +1,2254 @@
+/*
+ * Armv8 Neon optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ * All Rights Reserved.
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved.
+ * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
+ * Copyright (C) 2014-2016, 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
+ * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
+#endif
+
+#if defined(__APPLE__)
+.section __DATA, __const
+#elif defined(_WIN32)
+.section .rdata
+#else
+.section .rodata, "a", %progbits
+#endif
+
+/* Constants for jsimd_idct_islow_neon() */
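+
+/* Note: FIX(x) denotes x scaled by 2^CONST_BITS (CONST_BITS = 13, as defined
+   below), e.g. FIX(0.298631336) = round(0.298631336 * 8192) = 2446.  The same
+   convention is used for the jsimd_fdct_islow_neon() constants further down. */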
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_idct_islow_neon_consts:
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short -F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short -F_1_847
+  .short -F_1_961
+  .short F_2_053
+  .short -F_2_562
+  .short F_3_072
+  .short 0  /* padding */
+  .short 0
+  .short 0
+  .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_ycc_*_neon() */
+
+.balign 16
+Ljsimd_ycc_rgb_neon_consts:
+ .short 0, 0, 0, 0
+ .short 22971, -11277, -23401, 29033
+ .short -128, -128, -128, -128
+ .short -128, -128, -128, -128
+
+/* Constants for jsimd_*_ycc_neon() */
+
+.balign 16
+Ljsimd_rgb_ycc_neon_consts:
+ .short 19595, 38470, 7471, 11059
+ .short 21709, 32768, 27439, 5329
+ .short 32767, 128, 32767, 128
+ .short 32767, 128, 32767, 128
+
+/* Constants for jsimd_fdct_islow_neon() */
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_fdct_islow_neon_consts:
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short -F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short -F_1_847
+  .short -F_1_961
+  .short F_2_053
+  .short -F_2_562
+  .short F_3_072
+  .short 0  /* padding */
+  .short 0
+  .short 0
+  .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_huff_encode_one_block_neon() */
+
+.balign 16
+Ljsimd_huff_encode_one_block_neon_consts:
+ .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+ .byte 0, 1, 2, 3, 16, 17, 32, 33, \
+ 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
+ .byte 34, 35, 48, 49, 255, 255, 50, 51, \
+ 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
+ .byte 8, 9, 22, 23, 36, 37, 50, 51, \
+ 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
+ .byte 54, 55, 40, 41, 26, 27, 12, 13, \
+ 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
+ .byte 6, 7, 20, 21, 34, 35, 48, 49, \
+ 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
+ .byte 42, 43, 28, 29, 14, 15, 30, 31, \
+ 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
+ .byte 255, 255, 255, 255, 56, 57, 42, 43, \
+ 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
+ .byte 26, 27, 40, 41, 42, 43, 28, 29, \
+ 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
+ .byte 255, 255, 255, 255, 0, 1, 255, 255, \
+ 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 lines OK */
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, \
+ 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, \
+ 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
+ .byte 4, 5, 6, 7, 255, 255, 255, 255, \
+ 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
+
+.text
+
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro asm_function fname
+#ifdef __APPLE__
+ .private_extern _\fname
+ .globl _\fname
+_\fname:
+#else
+ .global \fname
+#ifdef __ELF__
+ .hidden \fname
+ .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+/* Get symbol location */
+.macro get_symbol_loc reg, symbol
+#ifdef __APPLE__
+ adrp \reg, \symbol@PAGE
+ add \reg, \reg, \symbol@PAGEOFF
+#else
+ adrp \reg, \symbol
+ add \reg, \reg, :lo12:\symbol
+#endif
+.endm
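+
+/* Note: adrp materializes the 4 KB-page-aligned address of the symbol
+   PC-relatively; the following add fills in the low 12 bits of the offset
+   (via @PAGE/@PAGEOFF relocations on Mach-O and :lo12: on ELF). */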
+
+.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
+ trn1 \t0\().8h, \l0\().8h, \l1\().8h
+ trn1 \t1\().8h, \l2\().8h, \l3\().8h
+ trn1 \t2\().8h, \l4\().8h, \l5\().8h
+ trn1 \t3\().8h, \l6\().8h, \l7\().8h
+ trn2 \l1\().8h, \l0\().8h, \l1\().8h
+ trn2 \l3\().8h, \l2\().8h, \l3\().8h
+ trn2 \l5\().8h, \l4\().8h, \l5\().8h
+ trn2 \l7\().8h, \l6\().8h, \l7\().8h
+
+ trn1 \l4\().4s, \t2\().4s, \t3\().4s
+ trn2 \t3\().4s, \t2\().4s, \t3\().4s
+ trn1 \t2\().4s, \t0\().4s, \t1\().4s
+ trn2 \l2\().4s, \t0\().4s, \t1\().4s
+ trn1 \t0\().4s, \l1\().4s, \l3\().4s
+ trn2 \l3\().4s, \l1\().4s, \l3\().4s
+ trn2 \t1\().4s, \l5\().4s, \l7\().4s
+ trn1 \l5\().4s, \l5\().4s, \l7\().4s
+
+ trn2 \l6\().2d, \l2\().2d, \t3\().2d
+ trn1 \l0\().2d, \t2\().2d, \l4\().2d
+ trn1 \l1\().2d, \t0\().2d, \l5\().2d
+ trn2 \l7\().2d, \l3\().2d, \t1\().2d
+ trn1 \l2\().2d, \l2\().2d, \t3\().2d
+ trn2 \l4\().2d, \t2\().2d, \l4\().2d
+ trn1 \l3\().2d, \l3\().2d, \t1\().2d
+ trn2 \l5\().2d, \t0\().2d, \l5\().2d
+.endm
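+
+/* Note: the macro above is the usual three-pass Neon 8x8 transpose of 16-bit
+   elements: the trn1/trn2 pass on 16-bit lanes transposes each 2x2 sub-block,
+   the 32-bit pass transposes 2x2 blocks of those, and the 64-bit pass swaps
+   the remaining 4x4 quadrants, with t0-t3 used as scratch registers. */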
+
+
+#define CENTERJSAMPLE 128
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ * JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define XFIX_P_0_298 v0.h[0]
+#define XFIX_N_0_390 v0.h[1]
+#define XFIX_P_0_541 v0.h[2]
+#define XFIX_P_0_765 v0.h[3]
+#define XFIX_N_0_899 v0.h[4]
+#define XFIX_P_1_175 v0.h[5]
+#define XFIX_P_1_501 v0.h[6]
+#define XFIX_N_1_847 v0.h[7]
+#define XFIX_N_1_961 v1.h[0]
+#define XFIX_P_2_053 v1.h[1]
+#define XFIX_N_2_562 v1.h[2]
+#define XFIX_P_3_072 v1.h[3]
+
+asm_function jsimd_idct_islow_neon
+ DCT_TABLE .req x0
+ COEF_BLOCK .req x1
+ OUTPUT_BUF .req x2
+ OUTPUT_COL .req x3
+ TMP1 .req x0
+ TMP2 .req x1
+ TMP3 .req x9
+ TMP4 .req x10
+ TMP5 .req x11
+ TMP6 .req x12
+ TMP7 .req x13
+ TMP8 .req x14
+
+ /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+ guarantee that the upper (unused) 32 bits of x3 are valid. This
+ instruction ensures that those bits are set to zero. */
+ uxtw x3, w3
+
+ sub sp, sp, #64
+ get_symbol_loc x15, Ljsimd_idct_islow_neon_consts
+ mov x10, sp
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
+ ld1 {v0.8h, v1.8h}, [x15]
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
+ ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
+ ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
+ ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
+
+ cmeq v16.8h, v3.8h, #0
+ cmeq v26.8h, v4.8h, #0
+ cmeq v27.8h, v5.8h, #0
+ cmeq v28.8h, v6.8h, #0
+ cmeq v29.8h, v7.8h, #0
+ cmeq v30.8h, v8.8h, #0
+ cmeq v31.8h, v9.8h, #0
+
+ and v10.16b, v16.16b, v26.16b
+ and v11.16b, v27.16b, v28.16b
+ and v12.16b, v29.16b, v30.16b
+ and v13.16b, v31.16b, v10.16b
+ and v14.16b, v11.16b, v12.16b
+ mul v2.8h, v2.8h, v18.8h
+ and v15.16b, v13.16b, v14.16b
+ shl v10.8h, v2.8h, #(PASS1_BITS)
+ sqxtn v16.8b, v15.8h
+ mov TMP1, v16.d[0]
+ mvn TMP2, TMP1
+
+ cbnz TMP2, 2f
+  /* Case: all AC coefficients are zero */
+ dup v2.2d, v10.d[0]
+ dup v6.2d, v10.d[1]
+ mov v3.16b, v2.16b
+ mov v7.16b, v6.16b
+ mov v4.16b, v2.16b
+ mov v8.16b, v6.16b
+ mov v5.16b, v2.16b
+ mov v9.16b, v6.16b
+1:
+ /* for this transpose, we should organise data like this:
+ * 00, 01, 02, 03, 40, 41, 42, 43
+ * 10, 11, 12, 13, 50, 51, 52, 53
+ * 20, 21, 22, 23, 60, 61, 62, 63
+ * 30, 31, 32, 33, 70, 71, 72, 73
+ * 04, 05, 06, 07, 44, 45, 46, 47
+ * 14, 15, 16, 17, 54, 55, 56, 57
+ * 24, 25, 26, 27, 64, 65, 66, 67
+ * 34, 35, 36, 37, 74, 75, 76, 77
+ */
+ trn1 v28.8h, v2.8h, v3.8h
+ trn1 v29.8h, v4.8h, v5.8h
+ trn1 v30.8h, v6.8h, v7.8h
+ trn1 v31.8h, v8.8h, v9.8h
+ trn2 v16.8h, v2.8h, v3.8h
+ trn2 v17.8h, v4.8h, v5.8h
+ trn2 v18.8h, v6.8h, v7.8h
+ trn2 v19.8h, v8.8h, v9.8h
+ trn1 v2.4s, v28.4s, v29.4s
+ trn1 v6.4s, v30.4s, v31.4s
+ trn1 v3.4s, v16.4s, v17.4s
+ trn1 v7.4s, v18.4s, v19.4s
+ trn2 v4.4s, v28.4s, v29.4s
+ trn2 v8.4s, v30.4s, v31.4s
+ trn2 v5.4s, v16.4s, v17.4s
+ trn2 v9.4s, v18.4s, v19.4s
+ /* Even part: reverse the even part of the forward DCT. */
+ add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+ add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+ smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+ sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+ smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+ sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+ mov v21.16b, v19.16b /* tmp3 = z1 */
+ mov v20.16b, v18.16b /* tmp3 = z1 */
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+ sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+ smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+ smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+ sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+ sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+ add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
+ sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
+ add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
+ sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
+ add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
+ sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
+ add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
+ sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
+
+ /* Odd part per figure 8; the matrix is unitary and hence its
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
+ */
+
+ add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+ add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+ add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+ add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+ add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
+
+ smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+ smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+ add v23.4s, v23.4s, v27.4s /* z3 += z5 */
+ add v22.4s, v22.4s, v26.4s /* z3 += z5 */
+ add v25.4s, v25.4s, v27.4s /* z4 += z5 */
+ add v24.4s, v24.4s, v26.4s /* z4 += z5 */
+
+ add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
+ add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
+ add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
+ add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
+ add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
+ add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
+ add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
+ add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
+
+ add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
+ add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
+ add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
+ add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
+ add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
+ add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
+ add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
+ add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
+
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+ add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
+ add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
+ sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
+ sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
+ add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
+ add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
+ sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
+ sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
+ add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
+ add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
+ sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
+ sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
+ add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
+ add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
+ sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
+ sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
+
+ shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+ shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+ shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+ shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+ shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+ shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+ shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+ shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+ shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+ movi v0.16b, #(CENTERJSAMPLE)
+ /* Prepare pointers (dual-issue with Neon instructions) */
+ ldp TMP1, TMP2, [OUTPUT_BUF], 16
+ sqrshrn v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+ ldp TMP3, TMP4, [OUTPUT_BUF], 16
+ sqrshrn v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+ add TMP1, TMP1, OUTPUT_COL
+ sqrshrn v30.8b, v4.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+ add TMP2, TMP2, OUTPUT_COL
+ sqrshrn v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+ add TMP3, TMP3, OUTPUT_COL
+ sqrshrn2 v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+ add TMP4, TMP4, OUTPUT_COL
+ sqrshrn2 v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+ ldp TMP5, TMP6, [OUTPUT_BUF], 16
+ sqrshrn2 v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+ ldp TMP7, TMP8, [OUTPUT_BUF], 16
+ sqrshrn2 v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
+ add TMP5, TMP5, OUTPUT_COL
+ add v16.16b, v28.16b, v0.16b
+ add TMP6, TMP6, OUTPUT_COL
+ add v18.16b, v29.16b, v0.16b
+ add TMP7, TMP7, OUTPUT_COL
+ add v20.16b, v30.16b, v0.16b
+ add TMP8, TMP8, OUTPUT_COL
+ add v22.16b, v31.16b, v0.16b
+
+ /* Transpose the final 8-bit samples */
+ trn1 v28.16b, v16.16b, v18.16b
+ trn1 v30.16b, v20.16b, v22.16b
+ trn2 v29.16b, v16.16b, v18.16b
+ trn2 v31.16b, v20.16b, v22.16b
+
+ trn1 v16.8h, v28.8h, v30.8h
+ trn2 v18.8h, v28.8h, v30.8h
+ trn1 v20.8h, v29.8h, v31.8h
+ trn2 v22.8h, v29.8h, v31.8h
+
+ uzp1 v28.4s, v16.4s, v18.4s
+ uzp2 v30.4s, v16.4s, v18.4s
+ uzp1 v29.4s, v20.4s, v22.4s
+ uzp2 v31.4s, v20.4s, v22.4s
+
+ /* Store results to the output buffer */
+ st1 {v28.d}[0], [TMP1]
+ st1 {v29.d}[0], [TMP2]
+ st1 {v28.d}[1], [TMP3]
+ st1 {v29.d}[1], [TMP4]
+ st1 {v30.d}[0], [TMP5]
+ st1 {v31.d}[0], [TMP6]
+ st1 {v30.d}[1], [TMP7]
+ st1 {v31.d}[1], [TMP8]
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
+ blr x30
+
+.balign 16
+2:
+ mul v3.8h, v3.8h, v19.8h
+ mul v4.8h, v4.8h, v20.8h
+ mul v5.8h, v5.8h, v21.8h
+ add TMP4, xzr, TMP2, LSL #32
+ mul v6.8h, v6.8h, v22.8h
+ mul v7.8h, v7.8h, v23.8h
+ adds TMP3, xzr, TMP2, LSR #32
+ mul v8.8h, v8.8h, v24.8h
+ mul v9.8h, v9.8h, v25.8h
+ b.ne 3f
+ /* Right AC coef is zero */
+ dup v15.2d, v10.d[1]
+ /* Even part: reverse the even part of the forward DCT. */
+ add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+ add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+ sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+ smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+ sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+ mov v20.16b, v18.16b /* tmp3 = z1 */
+ sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+ smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+ add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
+ sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
+ add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
+ sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
+
+ /* Odd part per figure 8; the matrix is unitary and hence its
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
+ */
+
+ add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+ add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+ add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+ add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+ add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */
+
+ smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+ add v22.4s, v22.4s, v26.4s /* z3 += z5 */
+ add v24.4s, v24.4s, v26.4s /* z4 += z5 */
+
+ add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
+ add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
+ add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
+ add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
+
+ add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
+ add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
+ add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
+ add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
+
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+ add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
+ sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
+ add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
+ sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
+ add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
+ sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
+ add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
+ sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
+
+ rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+ mov v6.16b, v15.16b
+ mov v7.16b, v15.16b
+ mov v8.16b, v15.16b
+ mov v9.16b, v15.16b
+ b 1b
+
+.balign 16
+3:
+ cbnz TMP4, 4f
+ /* Left AC coef is zero */
+ dup v14.2d, v10.d[0]
+ /* Even part: reverse the even part of the forward DCT. */
+ add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+ add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+ smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+ sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+ sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+ mov v21.16b, v19.16b /* tmp3 = z1 */
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+ sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+ smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+ add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
+ sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
+ add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
+ sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
+
+ /* Odd part per figure 8; the matrix is unitary and hence its
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
+ */
+
+ add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+ add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+ add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+ add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+ add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
+
+ smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+ add v23.4s, v23.4s, v27.4s /* z3 += z5 */
+ add v22.4s, v22.4s, v26.4s /* z3 += z5 */
+ add v25.4s, v25.4s, v27.4s /* z4 += z5 */
+ add v24.4s, v24.4s, v26.4s /* z4 += z5 */
+
+ add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
+ add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
+ add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
+ add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
+
+ add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
+ add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
+ add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
+ add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
+
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+ add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
+ sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
+ add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
+ sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
+ add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
+ sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
+ add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
+ sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
+
+ mov v2.16b, v14.16b
+ mov v3.16b, v14.16b
+ mov v4.16b, v14.16b
+ mov v5.16b, v14.16b
+ rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+ b 1b
+
+.balign 16
+4:
+ /* "No" AC coef is zero */
+ /* Even part: reverse the even part of the forward DCT. */
+ add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
+ add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+ smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+ sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
+ smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
+ sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+ mov v21.16b, v19.16b /* tmp3 = z1 */
+ mov v20.16b, v18.16b /* tmp3 = z1 */
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+ sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+ smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+ smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
+ sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
+ sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
+ add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
+ sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
+ add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
+ sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
+ add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
+ sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
+ add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
+ sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
+
+ /* Odd part per figure 8; the matrix is unitary and hence its
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
+ */
+
+ add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+ add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+ add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
+ add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
+ add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
+
+ smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+ smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+ smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+ smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+ smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+ smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
+
+ add v23.4s, v23.4s, v27.4s /* z3 += z5 */
+ add v22.4s, v22.4s, v26.4s /* z3 += z5 */
+ add v25.4s, v25.4s, v27.4s /* z4 += z5 */
+ add v24.4s, v24.4s, v26.4s /* z4 += z5 */
+
+ add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
+ add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
+ add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
+ add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
+ add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
+ add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
+ add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
+ add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
+
+ add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
+ add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
+ add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
+ add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
+ add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
+ add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
+ add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
+ add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
+
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+
+ add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
+ add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
+ sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
+ sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
+ add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
+ add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
+ sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
+ sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
+ add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
+ add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
+ sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
+ sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
+ add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
+ add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
+ sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
+ sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
+
+ rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+ rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+ rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+ rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+ rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+ b 1b
+
+ .unreq DCT_TABLE
+ .unreq COEF_BLOCK
+ .unreq OUTPUT_BUF
+ .unreq OUTPUT_COL
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
+ .unreq TMP4
+ .unreq TMP5
+ .unreq TMP6
+ .unreq TMP7
+ .unreq TMP8
+
+#undef CENTERJSAMPLE
+#undef CONST_BITS
+#undef PASS1_BITS
+#undef XFIX_P_0_298
+#undef XFIX_N_0_390
+#undef XFIX_P_0_541
+#undef XFIX_P_0_765
+#undef XFIX_N_0_899
+#undef XFIX_P_1_175
+#undef XFIX_P_1_501
+#undef XFIX_N_1_847
+#undef XFIX_N_1_961
+#undef XFIX_P_2_053
+#undef XFIX_N_2_562
+#undef XFIX_P_3_072
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_ycc_extrgb_convert_neon
+ * jsimd_ycc_extbgr_convert_neon
+ * jsimd_ycc_extrgbx_convert_neon
+ * jsimd_ycc_extbgrx_convert_neon
+ * jsimd_ycc_extxbgr_convert_neon
+ * jsimd_ycc_extxrgb_convert_neon
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
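+
+/* A minimal scalar sketch of the per-pixel arithmetic that the Neon macros
+ * below implement, using the fixed-point multipliers and rounding shifts
+ * noted in the do_yuv_to_rgb_stage1/stage2 comments.  clamp() stands in for
+ * the 0..255 saturation performed by the sqxtun instructions:
+ *
+ *   int cb = u - 128, cr = v - 128;
+ *   r = clamp(y + ((22971 * cr + (1 << 13)) >> 14));
+ *   g = clamp(y + ((-11277 * cb - 23401 * cr + (1 << 14)) >> 15));
+ *   b = clamp(y + ((29033 * cb + (1 << 13)) >> 14));
+ */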
+
+.macro do_load size
+ .if \size == 8
+ ld1 {v4.8b}, [U], 8
+ ld1 {v5.8b}, [V], 8
+ ld1 {v0.8b}, [Y], 8
+ prfm pldl1keep, [U, #64]
+ prfm pldl1keep, [V, #64]
+ prfm pldl1keep, [Y, #64]
+ .elseif \size == 4
+ ld1 {v4.b}[0], [U], 1
+ ld1 {v4.b}[1], [U], 1
+ ld1 {v4.b}[2], [U], 1
+ ld1 {v4.b}[3], [U], 1
+ ld1 {v5.b}[0], [V], 1
+ ld1 {v5.b}[1], [V], 1
+ ld1 {v5.b}[2], [V], 1
+ ld1 {v5.b}[3], [V], 1
+ ld1 {v0.b}[0], [Y], 1
+ ld1 {v0.b}[1], [Y], 1
+ ld1 {v0.b}[2], [Y], 1
+ ld1 {v0.b}[3], [Y], 1
+ .elseif \size == 2
+ ld1 {v4.b}[4], [U], 1
+ ld1 {v4.b}[5], [U], 1
+ ld1 {v5.b}[4], [V], 1
+ ld1 {v5.b}[5], [V], 1
+ ld1 {v0.b}[4], [Y], 1
+ ld1 {v0.b}[5], [Y], 1
+ .elseif \size == 1
+ ld1 {v4.b}[6], [U], 1
+ ld1 {v5.b}[6], [V], 1
+ ld1 {v0.b}[6], [Y], 1
+ .else
+ .error unsupported macroblock size
+ .endif
+.endm
+
+.macro do_store bpp, size, fast_st3
+ .if \bpp == 24
+ .if \size == 8
+ .if \fast_st3 == 1
+ st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24
+ .else
+ st1 {v10.b}[0], [RGB], #1
+ st1 {v11.b}[0], [RGB], #1
+ st1 {v12.b}[0], [RGB], #1
+
+ st1 {v10.b}[1], [RGB], #1
+ st1 {v11.b}[1], [RGB], #1
+ st1 {v12.b}[1], [RGB], #1
+
+ st1 {v10.b}[2], [RGB], #1
+ st1 {v11.b}[2], [RGB], #1
+ st1 {v12.b}[2], [RGB], #1
+
+ st1 {v10.b}[3], [RGB], #1
+ st1 {v11.b}[3], [RGB], #1
+ st1 {v12.b}[3], [RGB], #1
+
+ st1 {v10.b}[4], [RGB], #1
+ st1 {v11.b}[4], [RGB], #1
+ st1 {v12.b}[4], [RGB], #1
+
+ st1 {v10.b}[5], [RGB], #1
+ st1 {v11.b}[5], [RGB], #1
+ st1 {v12.b}[5], [RGB], #1
+
+ st1 {v10.b}[6], [RGB], #1
+ st1 {v11.b}[6], [RGB], #1
+ st1 {v12.b}[6], [RGB], #1
+
+ st1 {v10.b}[7], [RGB], #1
+ st1 {v11.b}[7], [RGB], #1
+ st1 {v12.b}[7], [RGB], #1
+ .endif
+ .elseif \size == 4
+ st3 {v10.b, v11.b, v12.b}[0], [RGB], 3
+ st3 {v10.b, v11.b, v12.b}[1], [RGB], 3
+ st3 {v10.b, v11.b, v12.b}[2], [RGB], 3
+ st3 {v10.b, v11.b, v12.b}[3], [RGB], 3
+ .elseif \size == 2
+ st3 {v10.b, v11.b, v12.b}[4], [RGB], 3
+ st3 {v10.b, v11.b, v12.b}[5], [RGB], 3
+ .elseif \size == 1
+ st3 {v10.b, v11.b, v12.b}[6], [RGB], 3
+ .else
+ .error unsupported macroblock size
+ .endif
+ .elseif \bpp == 32
+ .if \size == 8
+ st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
+ .elseif \size == 4
+ st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
+ st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
+ st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
+ st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
+ .elseif \size == 2
+ st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
+ st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
+ .elseif \size == 1
+ st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
+ .else
+ .error unsupported macroblock size
+ .endif
+ .elseif \bpp == 16
+ .if \size == 8
+ st1 {v25.8h}, [RGB], 16
+ .elseif \size == 4
+ st1 {v25.4h}, [RGB], 8
+ .elseif \size == 2
+ st1 {v25.h}[4], [RGB], 2
+ st1 {v25.h}[5], [RGB], 2
+ .elseif \size == 1
+ st1 {v25.h}[6], [RGB], 2
+ .else
+ .error unsupported macroblock size
+ .endif
+ .else
+ .error unsupported bpp
+ .endif
+.endm
+
+.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
+ g_offs, gsize, b_offs, bsize, \
+ defsize, fast_st3
+
+/*
+ * 2-stage pipelined YCbCr->RGB conversion
+ */
+
+.macro do_yuv_to_rgb_stage1
+ uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */
+ uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */
+ smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
+ smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
+ smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
+ smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
+ smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
+ smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
+ smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
+ smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
+.endm
+
+.macro do_yuv_to_rgb_stage2
+ rshrn v20.4h, v20.4s, #15
+ rshrn2 v20.8h, v22.4s, #15
+ rshrn v24.4h, v24.4s, #14
+ rshrn2 v24.8h, v26.4s, #14
+ rshrn v28.4h, v28.4s, #14
+ rshrn2 v28.8h, v30.4s, #14
+ uaddw v20.8h, v20.8h, v0.8b
+ uaddw v24.8h, v24.8h, v0.8b
+ uaddw v28.8h, v28.8h, v0.8b
+ .if \bpp != 16
+ sqxtun v1\g_offs\defsize, v20.8h
+ sqxtun v1\r_offs\defsize, v24.8h
+ sqxtun v1\b_offs\defsize, v28.8h
+ .else
+ sqshlu v21.8h, v20.8h, #8
+ sqshlu v25.8h, v24.8h, #8
+ sqshlu v29.8h, v28.8h, #8
+ sri v25.8h, v21.8h, #5
+ sri v25.8h, v29.8h, #11
+ .endif
+.endm
+
+.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
+ rshrn v20.4h, v20.4s, #15
+ rshrn v24.4h, v24.4s, #14
+ rshrn v28.4h, v28.4s, #14
+ ld1 {v4.8b}, [U], 8
+ rshrn2 v20.8h, v22.4s, #15
+ rshrn2 v24.8h, v26.4s, #14
+ rshrn2 v28.8h, v30.4s, #14
+ ld1 {v5.8b}, [V], 8
+ uaddw v20.8h, v20.8h, v0.8b
+ uaddw v24.8h, v24.8h, v0.8b
+ uaddw v28.8h, v28.8h, v0.8b
+ .if \bpp != 16 /**************** rgb24/rgb32 ******************************/
+ sqxtun v1\g_offs\defsize, v20.8h
+ ld1 {v0.8b}, [Y], 8
+ sqxtun v1\r_offs\defsize, v24.8h
+ prfm pldl1keep, [U, #64]
+ prfm pldl1keep, [V, #64]
+ prfm pldl1keep, [Y, #64]
+ sqxtun v1\b_offs\defsize, v28.8h
+ uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */
+ uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */
+ smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
+ smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
+ smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
+ smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
+ smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
+ smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
+ .else /**************************** rgb565 ********************************/
+ sqshlu v21.8h, v20.8h, #8
+ sqshlu v25.8h, v24.8h, #8
+ sqshlu v29.8h, v28.8h, #8
+ uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */
+ uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */
+ ld1 {v0.8b}, [Y], 8
+ smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
+ smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
+ smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
+ smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
+ sri v25.8h, v21.8h, #5
+ smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
+ smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
+ prfm pldl1keep, [U, #64]
+ prfm pldl1keep, [V, #64]
+ prfm pldl1keep, [Y, #64]
+ sri v25.8h, v29.8h, #11
+ .endif
+ do_store \bpp, 8, \fast_st3
+ smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
+ smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
+.endm
+
+.macro do_yuv_to_rgb
+ do_yuv_to_rgb_stage1
+ do_yuv_to_rgb_stage2
+.endm
+
+.if \fast_st3 == 1
+asm_function jsimd_ycc_\colorid\()_convert_neon
+.else
+asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
+.endif
+ OUTPUT_WIDTH .req w0
+ INPUT_BUF .req x1
+ INPUT_ROW .req w2
+ OUTPUT_BUF .req x3
+ NUM_ROWS .req w4
+
+ INPUT_BUF0 .req x5
+ INPUT_BUF1 .req x6
+ INPUT_BUF2 .req x1
+
+ RGB .req x7
+ Y .req x9
+ U .req x10
+ V .req x11
+ N .req w15
+
+ sub sp, sp, 64
+ mov x9, sp
+
+ /* Load constants to v1.4h and v2.8h (v0.4h is just used for padding) */
+ get_symbol_loc x15, Ljsimd_ycc_rgb_neon_consts
+
+ /* Save Neon registers */
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
+ ld1 {v0.4h, v1.4h}, [x15], 16
+ ld1 {v2.8h}, [x15]
+
+ ldr INPUT_BUF0, [INPUT_BUF]
+ ldr INPUT_BUF1, [INPUT_BUF, #8]
+ ldr INPUT_BUF2, [INPUT_BUF, #16]
+ .unreq INPUT_BUF
+
+ /* Initially set v10.16b and v13.16b to 0xFF */
+ movi v10.16b, #255
+ movi v13.16b, #255
+
+ /* Outer loop over scanlines */
+ cmp NUM_ROWS, #1
+ b.lt 9f
+0:
+ ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
+ ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
+ mov N, OUTPUT_WIDTH
+ ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
+ add INPUT_ROW, INPUT_ROW, #1
+ ldr RGB, [OUTPUT_BUF], #8
+
+ /* Inner loop over pixels */
+ subs N, N, #8
+ b.lt 3f
+ do_load 8
+ do_yuv_to_rgb_stage1
+ subs N, N, #8
+ b.lt 2f
+1:
+ do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
+ subs N, N, #8
+ b.ge 1b
+2:
+ do_yuv_to_rgb_stage2
+ do_store \bpp, 8, \fast_st3
+ tst N, #7
+ b.eq 8f
+3:
+ tst N, #4
+ b.eq 3f
+ do_load 4
+3:
+ tst N, #2
+ b.eq 4f
+ do_load 2
+4:
+ tst N, #1
+ b.eq 5f
+ do_load 1
+5:
+ do_yuv_to_rgb
+ tst N, #4
+ b.eq 6f
+ do_store \bpp, 4, \fast_st3
+6:
+ tst N, #2
+ b.eq 7f
+ do_store \bpp, 2, \fast_st3
+7:
+ tst N, #1
+ b.eq 8f
+ do_store \bpp, 1, \fast_st3
+8:
+ subs NUM_ROWS, NUM_ROWS, #1
+ b.gt 0b
+9:
+ /* Restore all registers and return */
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ br x30
+ .unreq OUTPUT_WIDTH
+ .unreq INPUT_ROW
+ .unreq OUTPUT_BUF
+ .unreq NUM_ROWS
+ .unreq INPUT_BUF0
+ .unreq INPUT_BUF1
+ .unreq INPUT_BUF2
+ .unreq RGB
+ .unreq Y
+ .unreq U
+ .unreq V
+ .unreq N
+
+.purgem do_yuv_to_rgb
+.purgem do_yuv_to_rgb_stage1
+.purgem do_yuv_to_rgb_stage2
+.purgem do_yuv_to_rgb_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3*/
+generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1
+generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1
+generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1
+generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1
+generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1
+generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1
+generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1
+
+generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0
+generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0
+
+.purgem do_load
+.purgem do_store
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_extrgb_ycc_convert_neon
+ * jsimd_extbgr_ycc_convert_neon
+ * jsimd_extrgbx_ycc_convert_neon
+ * jsimd_extbgrx_ycc_convert_neon
+ * jsimd_extxbgr_ycc_convert_neon
+ * jsimd_extxrgb_ycc_convert_neon
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
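+
+/* The conversion below is software pipelined: do_rgb_to_yuv_stage1 starts
+ * the multiplies for one group of eight pixels, and
+ * do_rgb_to_yuv_stage2_store_load_stage1 finishes and stores that group
+ * while already loading and starting the next one.  A rough C-like outline
+ * of the inner loop (the function names are placeholders that mirror the
+ * macros):
+ *
+ *   load8(); stage1();                        // prime the pipeline
+ *   while (another full group of 8 remains) {
+ *     stage2(); store8(); load8(); stage1();  // one fused iteration
+ *   }
+ *   stage2(); store8();                       // drain the pipeline
+ *   // then 1..7 leftover pixels via partial loads/stores
+ */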
+
+.macro do_store size
+ .if \size == 8
+ st1 {v20.8b}, [Y], #8
+ st1 {v21.8b}, [U], #8
+ st1 {v22.8b}, [V], #8
+ .elseif \size == 4
+ st1 {v20.b}[0], [Y], #1
+ st1 {v20.b}[1], [Y], #1
+ st1 {v20.b}[2], [Y], #1
+ st1 {v20.b}[3], [Y], #1
+ st1 {v21.b}[0], [U], #1
+ st1 {v21.b}[1], [U], #1
+ st1 {v21.b}[2], [U], #1
+ st1 {v21.b}[3], [U], #1
+ st1 {v22.b}[0], [V], #1
+ st1 {v22.b}[1], [V], #1
+ st1 {v22.b}[2], [V], #1
+ st1 {v22.b}[3], [V], #1
+ .elseif \size == 2
+ st1 {v20.b}[4], [Y], #1
+ st1 {v20.b}[5], [Y], #1
+ st1 {v21.b}[4], [U], #1
+ st1 {v21.b}[5], [U], #1
+ st1 {v22.b}[4], [V], #1
+ st1 {v22.b}[5], [V], #1
+ .elseif \size == 1
+ st1 {v20.b}[6], [Y], #1
+ st1 {v21.b}[6], [U], #1
+ st1 {v22.b}[6], [V], #1
+ .else
+ .error unsupported macroblock size
+ .endif
+.endm
+
+.macro do_load bpp, size, fast_ld3
+ .if \bpp == 24
+ .if \size == 8
+ .if \fast_ld3 == 1
+ ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24
+ .else
+ ld1 {v10.b}[0], [RGB], #1
+ ld1 {v11.b}[0], [RGB], #1
+ ld1 {v12.b}[0], [RGB], #1
+
+ ld1 {v10.b}[1], [RGB], #1
+ ld1 {v11.b}[1], [RGB], #1
+ ld1 {v12.b}[1], [RGB], #1
+
+ ld1 {v10.b}[2], [RGB], #1
+ ld1 {v11.b}[2], [RGB], #1
+ ld1 {v12.b}[2], [RGB], #1
+
+ ld1 {v10.b}[3], [RGB], #1
+ ld1 {v11.b}[3], [RGB], #1
+ ld1 {v12.b}[3], [RGB], #1
+
+ ld1 {v10.b}[4], [RGB], #1
+ ld1 {v11.b}[4], [RGB], #1
+ ld1 {v12.b}[4], [RGB], #1
+
+ ld1 {v10.b}[5], [RGB], #1
+ ld1 {v11.b}[5], [RGB], #1
+ ld1 {v12.b}[5], [RGB], #1
+
+ ld1 {v10.b}[6], [RGB], #1
+ ld1 {v11.b}[6], [RGB], #1
+ ld1 {v12.b}[6], [RGB], #1
+
+ ld1 {v10.b}[7], [RGB], #1
+ ld1 {v11.b}[7], [RGB], #1
+ ld1 {v12.b}[7], [RGB], #1
+ .endif
+ prfm pldl1keep, [RGB, #128]
+ .elseif \size == 4
+ ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3
+ ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3
+ ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3
+ ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3
+ .elseif \size == 2
+ ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3
+ ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3
+ .elseif \size == 1
+ ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3
+ .else
+ .error unsupported macroblock size
+ .endif
+ .elseif \bpp == 32
+ .if \size == 8
+ ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
+ prfm pldl1keep, [RGB, #128]
+ .elseif \size == 4
+ ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
+ ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
+ ld4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
+ ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
+ .elseif \size == 2
+ ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
+ ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
+ .elseif \size == 1
+ ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
+ .else
+ .error unsupported macroblock size
+ .endif
+ .else
+ .error unsupported bpp
+ .endif
+.endm
+
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
+ b_offs, fast_ld3
+
+/*
+ * 2-stage pipelined RGB->YCbCr conversion
+ */
+
+.macro do_rgb_to_yuv_stage1
+ ushll v4.8h, v1\r_offs\().8b, #0 /* r = v4 */
+ ushll v6.8h, v1\g_offs\().8b, #0 /* g = v6 */
+ ushll v8.8h, v1\b_offs\().8b, #0 /* b = v8 */
+ rev64 v18.4s, v1.4s
+ rev64 v26.4s, v1.4s
+ rev64 v28.4s, v1.4s
+ rev64 v30.4s, v1.4s
+ umull v14.4s, v4.4h, v0.h[0]
+ umull2 v16.4s, v4.8h, v0.h[0]
+ umlsl v18.4s, v4.4h, v0.h[3]
+ umlsl2 v26.4s, v4.8h, v0.h[3]
+ umlal v28.4s, v4.4h, v0.h[5]
+ umlal2 v30.4s, v4.8h, v0.h[5]
+ umlal v14.4s, v6.4h, v0.h[1]
+ umlal2 v16.4s, v6.8h, v0.h[1]
+ umlsl v18.4s, v6.4h, v0.h[4]
+ umlsl2 v26.4s, v6.8h, v0.h[4]
+ umlsl v28.4s, v6.4h, v0.h[6]
+ umlsl2 v30.4s, v6.8h, v0.h[6]
+ umlal v14.4s, v8.4h, v0.h[2]
+ umlal2 v16.4s, v8.8h, v0.h[2]
+ umlal v18.4s, v8.4h, v0.h[5]
+ umlal2 v26.4s, v8.8h, v0.h[5]
+ umlsl v28.4s, v8.4h, v0.h[7]
+ umlsl2 v30.4s, v8.8h, v0.h[7]
+.endm
+
+.macro do_rgb_to_yuv_stage2
+ rshrn v20.4h, v14.4s, #16
+ shrn v22.4h, v18.4s, #16
+ shrn v24.4h, v28.4s, #16
+ rshrn2 v20.8h, v16.4s, #16
+ shrn2 v22.8h, v26.4s, #16
+ shrn2 v24.8h, v30.4s, #16
+ xtn v20.8b, v20.8h /* v20 = y */
+ xtn v21.8b, v22.8h /* v21 = u */
+ xtn v22.8b, v24.8h /* v22 = v */
+.endm
+
+.macro do_rgb_to_yuv
+ do_rgb_to_yuv_stage1
+ do_rgb_to_yuv_stage2
+.endm
+
+/* TODO: expand macros and interleave instructions if some in-order
+ * AArch64 processor actually can dual-issue LOAD/STORE with ALU */
+.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
+ do_rgb_to_yuv_stage2
+ do_load \bpp, 8, \fast_ld3
+ st1 {v20.8b}, [Y], #8
+ st1 {v21.8b}, [U], #8
+ st1 {v22.8b}, [V], #8
+ do_rgb_to_yuv_stage1
+.endm
+
+.if \fast_ld3 == 1
+asm_function jsimd_\colorid\()_ycc_convert_neon
+.else
+asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
+.endif
+ OUTPUT_WIDTH .req w0
+ INPUT_BUF .req x1
+ OUTPUT_BUF .req x2
+ OUTPUT_ROW .req w3
+ NUM_ROWS .req w4
+
+ OUTPUT_BUF0 .req x5
+ OUTPUT_BUF1 .req x6
+ OUTPUT_BUF2 .req x2 /* OUTPUT_BUF */
+
+ RGB .req x7
+ Y .req x9
+ U .req x10
+ V .req x11
+ N .req w12
+
+ /* Load constants to d0, d1, d2, d3 */
+ get_symbol_loc x13, Ljsimd_rgb_ycc_neon_consts
+ ld1 {v0.8h, v1.8h}, [x13]
+
+ ldr OUTPUT_BUF0, [OUTPUT_BUF]
+ ldr OUTPUT_BUF1, [OUTPUT_BUF, #8]
+ ldr OUTPUT_BUF2, [OUTPUT_BUF, #16]
+ .unreq OUTPUT_BUF
+
+ /* Save Neon registers */
+ sub sp, sp, #64
+ mov x9, sp
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
+
+ /* Outer loop over scanlines */
+ cmp NUM_ROWS, #1
+ b.lt 9f
+0:
+ ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
+ ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
+ mov N, OUTPUT_WIDTH
+ ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
+ add OUTPUT_ROW, OUTPUT_ROW, #1
+ ldr RGB, [INPUT_BUF], #8
+
+ /* Inner loop over pixels */
+ subs N, N, #8
+ b.lt 3f
+ do_load \bpp, 8, \fast_ld3
+ do_rgb_to_yuv_stage1
+ subs N, N, #8
+ b.lt 2f
+1:
+ do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
+ subs N, N, #8
+ b.ge 1b
+2:
+ do_rgb_to_yuv_stage2
+ do_store 8
+ tst N, #7
+ b.eq 8f
+3:
+ tbz N, #2, 3f
+ do_load \bpp, 4, \fast_ld3
+3:
+ tbz N, #1, 4f
+ do_load \bpp, 2, \fast_ld3
+4:
+ tbz N, #0, 5f
+ do_load \bpp, 1, \fast_ld3
+5:
+ do_rgb_to_yuv
+ tbz N, #2, 6f
+ do_store 4
+6:
+ tbz N, #1, 7f
+ do_store 2
+7:
+ tbz N, #0, 8f
+ do_store 1
+8:
+ subs NUM_ROWS, NUM_ROWS, #1
+ b.gt 0b
+9:
+ /* Restore all registers and return */
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ br x30
+
+ .unreq OUTPUT_WIDTH
+ .unreq OUTPUT_ROW
+ .unreq INPUT_BUF
+ .unreq NUM_ROWS
+ .unreq OUTPUT_BUF0
+ .unreq OUTPUT_BUF1
+ .unreq OUTPUT_BUF2
+ .unreq RGB
+ .unreq Y
+ .unreq U
+ .unreq V
+ .unreq N
+
+.purgem do_rgb_to_yuv
+.purgem do_rgb_to_yuv_stage1
+.purgem do_rgb_to_yuv_stage2
+.purgem do_rgb_to_yuv_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R G B Fast LD3 */
+generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 1
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1
+
+generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 0
+generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0
+
+.purgem do_load
+.purgem do_store
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_fdct_islow_neon
+ *
+ * This file contains a slower but more accurate integer implementation of the
+ * forward DCT (Discrete Cosine Transform). The following code is based
+ * directly on the IJG's original jfdctint.c; see jfdctint.c for more
+ * details.
+ *
+ * TODO: can be combined with 'jsimd_convsamp_neon' to get
+ * rid of a bunch of VLD1.16 instructions
+ */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+#define XFIX_P_0_298 v0.h[0]
+#define XFIX_N_0_390 v0.h[1]
+#define XFIX_P_0_541 v0.h[2]
+#define XFIX_P_0_765 v0.h[3]
+#define XFIX_N_0_899 v0.h[4]
+#define XFIX_P_1_175 v0.h[5]
+#define XFIX_P_1_501 v0.h[6]
+#define XFIX_N_1_847 v0.h[7]
+#define XFIX_N_1_961 v1.h[0]
+#define XFIX_P_2_053 v1.h[1]
+#define XFIX_N_2_562 v1.h[2]
+#define XFIX_P_3_072 v1.h[3]
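+
+/* The rshrn/rshrn2 instructions below implement the DESCALE() operation from
+ * jfdctint.c, i.e. a rounding arithmetic right shift: by DESCALE_P1 bits
+ * after the first pass and by DESCALE_P2 bits after the second.  A minimal
+ * scalar sketch of the same operation:
+ *
+ *   static inline int16_t descale(int32_t x, int n)
+ *   {
+ *     return (int16_t)((x + ((int32_t)1 << (n - 1))) >> n);
+ *   }
+ */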
+
+asm_function jsimd_fdct_islow_neon
+
+ DATA .req x0
+ TMP .req x9
+
+ /* Load constants */
+ get_symbol_loc TMP, Ljsimd_fdct_islow_neon_consts
+ ld1 {v0.8h, v1.8h}, [TMP]
+
+ /* Save Neon registers */
+ sub sp, sp, #64
+ mov x10, sp
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
+
+ /* Load all DATA into Neon registers with the following allocation:
+ * 0 1 2 3 | 4 5 6 7
+ * ---------+--------
+ * 0 | d16 | d17 | v16.8h
+ * 1 | d18 | d19 | v17.8h
+ * 2 | d20 | d21 | v18.8h
+ * 3 | d22 | d23 | v19.8h
+ * 4 | d24 | d25 | v20.8h
+ * 5 | d26 | d27 | v21.8h
+ * 6 | d28 | d29 | v22.8h
+ * 7 | d30 | d31 | v23.8h
+ */
+
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+ sub DATA, DATA, #64
+
+ /* Transpose */
+ transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
+ /* 1-D FDCT */
+ add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */
+ sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */
+ add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */
+ sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */
+ add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */
+ sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */
+ add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */
+ sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */
+
+ /* even part */
+
+ add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */
+ sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */
+ add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */
+ sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */
+
+ add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */
+ sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */
+
+ add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */
+
+ shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
+ shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
+
+ smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+ smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+ mov v22.16b, v18.16b
+ mov v25.16b, v24.16b
+
+ smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+ smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+ smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+ smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+
+ rshrn v18.4h, v18.4s, #DESCALE_P1
+ rshrn v22.4h, v22.4s, #DESCALE_P1
+ rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+ rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+
+ /* Odd part */
+
+ add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */
+ add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */
+ add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */
+ add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */
+ smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */
+ smull2 v5.4s, v10.8h, XFIX_P_1_175
+ smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+ smlal2 v5.4s, v11.8h, XFIX_P_1_175
+
+ smull2 v24.4s, v28.8h, XFIX_P_0_298
+ smull2 v25.4s, v29.8h, XFIX_P_2_053
+ smull2 v26.4s, v30.8h, XFIX_P_3_072
+ smull2 v27.4s, v31.8h, XFIX_P_1_501
+ smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+ smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+ smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+ smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+
+ smull2 v12.4s, v8.8h, XFIX_N_0_899
+ smull2 v13.4s, v9.8h, XFIX_N_2_562
+ smull2 v14.4s, v10.8h, XFIX_N_1_961
+ smull2 v15.4s, v11.8h, XFIX_N_0_390
+ smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+ smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+ smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+ smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
+
+ add v10.4s, v10.4s, v4.4s /* z3 += z5 */
+ add v14.4s, v14.4s, v5.4s
+ add v11.4s, v11.4s, v4.4s /* z4 += z5 */
+ add v15.4s, v15.4s, v5.4s
+
+ add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */
+ add v24.4s, v24.4s, v12.4s
+ add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */
+ add v25.4s, v25.4s, v13.4s
+ add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */
+ add v26.4s, v26.4s, v14.4s
+ add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */
+ add v27.4s, v27.4s, v15.4s
+
+ add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */
+ add v24.4s, v24.4s, v14.4s
+ add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */
+ add v25.4s, v25.4s, v15.4s
+ add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */
+ add v26.4s, v26.4s, v13.4s
+ add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */
+ add v27.4s, v27.4s, v12.4s
+
+ rshrn v23.4h, v28.4s, #DESCALE_P1
+ rshrn v21.4h, v29.4s, #DESCALE_P1
+ rshrn v19.4h, v30.4s, #DESCALE_P1
+ rshrn v17.4h, v31.4s, #DESCALE_P1
+ rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+ rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+ rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+ rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+
+ /* Transpose */
+ transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
+
+ /* 1-D FDCT */
+ add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */
+ sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */
+ add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */
+ sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */
+ add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */
+ sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */
+ add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */
+ sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */
+
+ /* even part */
+ add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */
+ sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */
+ add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */
+ sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */
+
+ add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */
+ sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */
+
+ add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */
+
+ srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
+ srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
+
+ smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+ smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
+ mov v22.16b, v18.16b
+ mov v25.16b, v24.16b
+
+ smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+ smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
+ smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+ smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
+
+ rshrn v18.4h, v18.4s, #DESCALE_P2
+ rshrn v22.4h, v22.4s, #DESCALE_P2
+ rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+ rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+
+ /* Odd part */
+ add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */
+ add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */
+ add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */
+ add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */
+
+ smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */
+ smull2 v5.4s, v10.8h, XFIX_P_1_175
+ smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
+ smlal2 v5.4s, v11.8h, XFIX_P_1_175
+
+ smull2 v24.4s, v28.8h, XFIX_P_0_298
+ smull2 v25.4s, v29.8h, XFIX_P_2_053
+ smull2 v26.4s, v30.8h, XFIX_P_3_072
+ smull2 v27.4s, v31.8h, XFIX_P_1_501
+ smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
+ smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
+ smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
+ smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
+
+ smull2 v12.4s, v8.8h, XFIX_N_0_899
+ smull2 v13.4s, v9.8h, XFIX_N_2_562
+ smull2 v14.4s, v10.8h, XFIX_N_1_961
+ smull2 v15.4s, v11.8h, XFIX_N_0_390
+ smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+ smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+ smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+ smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
+
+ add v10.4s, v10.4s, v4.4s
+ add v14.4s, v14.4s, v5.4s
+ add v11.4s, v11.4s, v4.4s
+ add v15.4s, v15.4s, v5.4s
+
+ add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */
+ add v24.4s, v24.4s, v12.4s
+ add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */
+ add v25.4s, v25.4s, v13.4s
+ add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */
+ add v26.4s, v26.4s, v14.4s
+ add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */
+ add v27.4s, v27.4s, v15.4s
+
+ add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */
+ add v24.4s, v24.4s, v14.4s
+ add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */
+ add v25.4s, v25.4s, v15.4s
+ add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */
+ add v26.4s, v26.4s, v13.4s
+ add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */
+ add v27.4s, v27.4s, v12.4s
+
+ rshrn v23.4h, v28.4s, #DESCALE_P2
+ rshrn v21.4h, v29.4s, #DESCALE_P2
+ rshrn v19.4h, v30.4s, #DESCALE_P2
+ rshrn v17.4h, v31.4s, #DESCALE_P2
+ rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+ rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+ rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+ rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+
+ /* store results */
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
+ st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+
+ /* Restore Neon registers */
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+
+ br x30
+
+ .unreq DATA
+ .unreq TMP
+
+#undef XFIX_P_0_298
+#undef XFIX_N_0_390
+#undef XFIX_P_0_541
+#undef XFIX_P_0_765
+#undef XFIX_N_0_899
+#undef XFIX_P_1_175
+#undef XFIX_P_1_501
+#undef XFIX_N_1_847
+#undef XFIX_N_1_961
+#undef XFIX_P_2_053
+#undef XFIX_N_2_562
+#undef XFIX_P_3_072
+
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(JOCTET *)
+ * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
+ * JCOEFPTR block, int last_dc_val,
+ * c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ *
+ */
+
+ BUFFER .req x1
+ PUT_BUFFER .req x6
+ PUT_BITS .req x7
+ PUT_BITSw .req w7
+
+.macro emit_byte
+ sub PUT_BITS, PUT_BITS, #0x8
+ lsr x19, PUT_BUFFER, PUT_BITS
+ uxtb w19, w19
+ strb w19, [BUFFER, #1]!
+ cmp w19, #0xff
+ b.ne 14f
+ strb wzr, [BUFFER, #1]!
+14:
+.endm
+.macro put_bits CODE, SIZE
+ lsl PUT_BUFFER, PUT_BUFFER, \SIZE
+ add PUT_BITS, PUT_BITS, \SIZE
+ orr PUT_BUFFER, PUT_BUFFER, \CODE
+.endm
+.macro checkbuf31
+ cmp PUT_BITS, #0x20
+ b.lt 31f
+ emit_byte
+ emit_byte
+ emit_byte
+ emit_byte
+31:
+.endm
+.macro checkbuf47
+ cmp PUT_BITS, #0x30
+ b.lt 47f
+ emit_byte
+ emit_byte
+ emit_byte
+ emit_byte
+ emit_byte
+ emit_byte
+47:
+.endm
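+
+/* A minimal C sketch of the bit-buffer handling above, assuming a 64-bit
+ * accumulator (PUT_BUFFER), a count of valid bits (PUT_BITS) and an output
+ * pointer that starts one byte low (see BUFFER=buffer-- below); 0xFF bytes
+ * are followed by a stuffed 0x00 as JPEG requires:
+ *
+ *   put_buffer = (put_buffer << size) | code;   // put_bits CODE, SIZE
+ *   put_bits  += size;
+ *   if (put_bits >= 48) {                       // checkbuf47
+ *     for (int i = 0; i < 6; i++) {             // six emit_byte expansions
+ *       put_bits -= 8;
+ *       uint8_t b = (uint8_t)(put_buffer >> put_bits);
+ *       *++buffer = b;
+ *       if (b == 0xFF) *++buffer = 0;
+ *     }
+ *   }
+ */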
+
+.macro generate_jsimd_huff_encode_one_block fast_tbl
+
+.if \fast_tbl == 1
+asm_function jsimd_huff_encode_one_block_neon
+.else
+asm_function jsimd_huff_encode_one_block_neon_slowtbl
+.endif
+ sub sp, sp, 272
+ sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
+ /* Save Arm registers */
+ stp x19, x20, [sp]
+ get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts
+ ldr PUT_BUFFER, [x0, #0x10]
+ ldr PUT_BITSw, [x0, #0x18]
+ ldrsh w12, [x2] /* load DC coeff in w12 */
+ /* prepare data */
+.if \fast_tbl == 1
+ ld1 {v23.16b}, [x15], #16
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
+ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
+ ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
+ ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
+ sub w12, w12, w3 /* last_dc_val, not used afterwards */
+ /* ZigZag 8x8 */
+ tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
+ tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
+ tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
+ tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
+ tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
+ tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
+ tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
+ tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
+ ins v0.h[0], w12
+ tbx v1.16b, {v28.16b}, v16.16b
+ tbx v2.16b, {v29.16b, v30.16b}, v17.16b
+ tbx v5.16b, {v29.16b, v30.16b}, v18.16b
+ tbx v6.16b, {v31.16b}, v19.16b
+.else
+ add x13, x2, #0x22
+ sub w12, w12, w3 /* last_dc_val, not used afterwards */
+ ld1 {v23.16b}, [x15]
+ add x14, x2, #0x18
+ add x3, x2, #0x36
+ ins v0.h[0], w12
+ add x9, x2, #0x2
+ ld1 {v1.h}[0], [x13]
+ add x15, x2, #0x30
+ ld1 {v2.h}[0], [x14]
+ add x19, x2, #0x26
+ ld1 {v3.h}[0], [x3]
+ add x20, x2, #0x28
+ ld1 {v0.h}[1], [x9]
+ add x12, x2, #0x10
+ ld1 {v1.h}[1], [x15]
+ add x13, x2, #0x40
+ ld1 {v2.h}[1], [x19]
+ add x14, x2, #0x34
+ ld1 {v3.h}[1], [x20]
+ add x3, x2, #0x1a
+ ld1 {v0.h}[2], [x12]
+ add x9, x2, #0x20
+ ld1 {v1.h}[2], [x13]
+ add x15, x2, #0x32
+ ld1 {v2.h}[2], [x14]
+ add x19, x2, #0x42
+ ld1 {v3.h}[2], [x3]
+ add x20, x2, #0xc
+ ld1 {v0.h}[3], [x9]
+ add x12, x2, #0x12
+ ld1 {v1.h}[3], [x15]
+ add x13, x2, #0x24
+ ld1 {v2.h}[3], [x19]
+ add x14, x2, #0x50
+ ld1 {v3.h}[3], [x20]
+ add x3, x2, #0xe
+ ld1 {v0.h}[4], [x12]
+ add x9, x2, #0x4
+ ld1 {v1.h}[4], [x13]
+ add x15, x2, #0x16
+ ld1 {v2.h}[4], [x14]
+ add x19, x2, #0x60
+ ld1 {v3.h}[4], [x3]
+ add x20, x2, #0x1c
+ ld1 {v0.h}[5], [x9]
+ add x12, x2, #0x6
+ ld1 {v1.h}[5], [x15]
+ add x13, x2, #0x8
+ ld1 {v2.h}[5], [x19]
+ add x14, x2, #0x52
+ ld1 {v3.h}[5], [x20]
+ add x3, x2, #0x2a
+ ld1 {v0.h}[6], [x12]
+ add x9, x2, #0x14
+ ld1 {v1.h}[6], [x13]
+ add x15, x2, #0xa
+ ld1 {v2.h}[6], [x14]
+ add x19, x2, #0x44
+ ld1 {v3.h}[6], [x3]
+ add x20, x2, #0x38
+ ld1 {v0.h}[7], [x9]
+ add x12, x2, #0x46
+ ld1 {v1.h}[7], [x15]
+ add x13, x2, #0x3a
+ ld1 {v2.h}[7], [x19]
+ add x14, x2, #0x74
+ ld1 {v3.h}[7], [x20]
+ add x3, x2, #0x6a
+ ld1 {v4.h}[0], [x12]
+ add x9, x2, #0x54
+ ld1 {v5.h}[0], [x13]
+ add x15, x2, #0x2c
+ ld1 {v6.h}[0], [x14]
+ add x19, x2, #0x76
+ ld1 {v7.h}[0], [x3]
+ add x20, x2, #0x78
+ ld1 {v4.h}[1], [x9]
+ add x12, x2, #0x62
+ ld1 {v5.h}[1], [x15]
+ add x13, x2, #0x1e
+ ld1 {v6.h}[1], [x19]
+ add x14, x2, #0x68
+ ld1 {v7.h}[1], [x20]
+ add x3, x2, #0x7a
+ ld1 {v4.h}[2], [x12]
+ add x9, x2, #0x70
+ ld1 {v5.h}[2], [x13]
+ add x15, x2, #0x2e
+ ld1 {v6.h}[2], [x14]
+ add x19, x2, #0x5a
+ ld1 {v7.h}[2], [x3]
+ add x20, x2, #0x6c
+ ld1 {v4.h}[3], [x9]
+ add x12, x2, #0x72
+ ld1 {v5.h}[3], [x15]
+ add x13, x2, #0x3c
+ ld1 {v6.h}[3], [x19]
+ add x14, x2, #0x4c
+ ld1 {v7.h}[3], [x20]
+ add x3, x2, #0x5e
+ ld1 {v4.h}[4], [x12]
+ add x9, x2, #0x64
+ ld1 {v5.h}[4], [x13]
+ add x15, x2, #0x4a
+ ld1 {v6.h}[4], [x14]
+ add x19, x2, #0x3e
+ ld1 {v7.h}[4], [x3]
+ add x20, x2, #0x6e
+ ld1 {v4.h}[5], [x9]
+ add x12, x2, #0x56
+ ld1 {v5.h}[5], [x15]
+ add x13, x2, #0x58
+ ld1 {v6.h}[5], [x19]
+ add x14, x2, #0x4e
+ ld1 {v7.h}[5], [x20]
+ add x3, x2, #0x7c
+ ld1 {v4.h}[6], [x12]
+ add x9, x2, #0x48
+ ld1 {v5.h}[6], [x13]
+ add x15, x2, #0x66
+ ld1 {v6.h}[6], [x14]
+ add x19, x2, #0x5c
+ ld1 {v7.h}[6], [x3]
+ add x20, x2, #0x7e
+ ld1 {v4.h}[7], [x9]
+ ld1 {v5.h}[7], [x15]
+ ld1 {v6.h}[7], [x19]
+ ld1 {v7.h}[7], [x20]
+.endif
+ cmlt v24.8h, v0.8h, #0
+ cmlt v25.8h, v1.8h, #0
+ cmlt v26.8h, v2.8h, #0
+ cmlt v27.8h, v3.8h, #0
+ cmlt v28.8h, v4.8h, #0
+ cmlt v29.8h, v5.8h, #0
+ cmlt v30.8h, v6.8h, #0
+ cmlt v31.8h, v7.8h, #0
+ abs v0.8h, v0.8h
+ abs v1.8h, v1.8h
+ abs v2.8h, v2.8h
+ abs v3.8h, v3.8h
+ abs v4.8h, v4.8h
+ abs v5.8h, v5.8h
+ abs v6.8h, v6.8h
+ abs v7.8h, v7.8h
+ eor v24.16b, v24.16b, v0.16b
+ eor v25.16b, v25.16b, v1.16b
+ eor v26.16b, v26.16b, v2.16b
+ eor v27.16b, v27.16b, v3.16b
+ eor v28.16b, v28.16b, v4.16b
+ eor v29.16b, v29.16b, v5.16b
+ eor v30.16b, v30.16b, v6.16b
+ eor v31.16b, v31.16b, v7.16b
+ cmeq v16.8h, v0.8h, #0
+ cmeq v17.8h, v1.8h, #0
+ cmeq v18.8h, v2.8h, #0
+ cmeq v19.8h, v3.8h, #0
+ cmeq v20.8h, v4.8h, #0
+ cmeq v21.8h, v5.8h, #0
+ cmeq v22.8h, v6.8h, #0
+ xtn v16.8b, v16.8h
+ xtn v18.8b, v18.8h
+ xtn v20.8b, v20.8h
+ xtn v22.8b, v22.8h
+ umov w14, v0.h[0]
+ xtn2 v16.16b, v17.8h
+ umov w13, v24.h[0]
+ xtn2 v18.16b, v19.8h
+ clz w14, w14
+ xtn2 v20.16b, v21.8h
+ lsl w13, w13, w14
+ cmeq v17.8h, v7.8h, #0
+ sub w12, w14, #32
+ xtn2 v22.16b, v17.8h
+ lsr w13, w13, w14
+ and v16.16b, v16.16b, v23.16b
+ neg w12, w12
+ and v18.16b, v18.16b, v23.16b
+ add x3, x4, #0x400 /* r1 = dctbl->ehufsi */
+ and v20.16b, v20.16b, v23.16b
+ add x15, sp, #0x90 /* x15 = t2 */
+ and v22.16b, v22.16b, v23.16b
+ ldr w10, [x4, x12, lsl #2]
+ addp v16.16b, v16.16b, v18.16b
+ ldrb w11, [x3, x12]
+ addp v20.16b, v20.16b, v22.16b
+ checkbuf47
+ addp v16.16b, v16.16b, v20.16b
+ put_bits x10, x11
+ addp v16.16b, v16.16b, v18.16b
+ checkbuf47
+ umov x9, v16.D[0]
+ put_bits x13, x12
+ cnt v17.8b, v16.8b
+ mvn x9, x9
+ addv B18, v17.8b
+ add x4, x5, #0x400 /* x4 = actbl->ehufsi */
+ umov w12, v18.b[0]
+ lsr x9, x9, #0x1 /* clear AC coeff */
+ ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */
+ rbit x9, x9 /* x9 = index0 */
+ ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */
+ cmp w12, #(64-8)
+ add x11, sp, #16
+ b.lt 4f
+ cbz x9, 6f
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
+ st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
+ st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
+1:
+ clz x2, x9
+ add x15, x15, x2, lsl #1
+ lsl x9, x9, x2
+ ldrh w20, [x15, #-126]
+2:
+ cmp x2, #0x10
+ b.lt 3f
+ sub x2, x2, #0x10
+ checkbuf47
+ put_bits x13, x14
+ b 2b
+3:
+ clz w20, w20
+ ldrh w3, [x15, #2]!
+ sub w11, w20, #32
+ lsl w3, w3, w20
+ neg w11, w11
+ lsr w3, w3, w20
+ add x2, x11, x2, lsl #4
+ lsl x9, x9, #0x1
+ ldr w12, [x5, x2, lsl #2]
+ ldrb w10, [x4, x2]
+ checkbuf31
+ put_bits x12, x10
+ put_bits x3, x11
+ cbnz x9, 1b
+ b 6f
+4:
+ movi v21.8h, #0x0010
+ clz v0.8h, v0.8h
+ clz v1.8h, v1.8h
+ clz v2.8h, v2.8h
+ clz v3.8h, v3.8h
+ clz v4.8h, v4.8h
+ clz v5.8h, v5.8h
+ clz v6.8h, v6.8h
+ clz v7.8h, v7.8h
+ ushl v24.8h, v24.8h, v0.8h
+ ushl v25.8h, v25.8h, v1.8h
+ ushl v26.8h, v26.8h, v2.8h
+ ushl v27.8h, v27.8h, v3.8h
+ ushl v28.8h, v28.8h, v4.8h
+ ushl v29.8h, v29.8h, v5.8h
+ ushl v30.8h, v30.8h, v6.8h
+ ushl v31.8h, v31.8h, v7.8h
+ neg v0.8h, v0.8h
+ neg v1.8h, v1.8h
+ neg v2.8h, v2.8h
+ neg v3.8h, v3.8h
+ neg v4.8h, v4.8h
+ neg v5.8h, v5.8h
+ neg v6.8h, v6.8h
+ neg v7.8h, v7.8h
+ ushl v24.8h, v24.8h, v0.8h
+ ushl v25.8h, v25.8h, v1.8h
+ ushl v26.8h, v26.8h, v2.8h
+ ushl v27.8h, v27.8h, v3.8h
+ ushl v28.8h, v28.8h, v4.8h
+ ushl v29.8h, v29.8h, v5.8h
+ ushl v30.8h, v30.8h, v6.8h
+ ushl v31.8h, v31.8h, v7.8h
+ add v0.8h, v21.8h, v0.8h
+ add v1.8h, v21.8h, v1.8h
+ add v2.8h, v21.8h, v2.8h
+ add v3.8h, v21.8h, v3.8h
+ add v4.8h, v21.8h, v4.8h
+ add v5.8h, v21.8h, v5.8h
+ add v6.8h, v21.8h, v6.8h
+ add v7.8h, v21.8h, v7.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
+ st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
+ st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
+1:
+ clz x2, x9
+ add x15, x15, x2, lsl #1
+ lsl x9, x9, x2
+ ldrh w11, [x15, #-126]
+2:
+ cmp x2, #0x10
+ b.lt 3f
+ sub x2, x2, #0x10
+ checkbuf47
+ put_bits x13, x14
+ b 2b
+3:
+ ldrh w3, [x15, #2]!
+ add x2, x11, x2, lsl #4
+ lsl x9, x9, #0x1
+ ldr w12, [x5, x2, lsl #2]
+ ldrb w10, [x4, x2]
+ checkbuf31
+ put_bits x12, x10
+ put_bits x3, x11
+ cbnz x9, 1b
+6:
+ add x13, sp, #0x10e
+ cmp x15, x13
+ b.hs 1f
+ ldr w12, [x5]
+ ldrb w14, [x4]
+ checkbuf47
+ put_bits x12, x14
+1:
+ str PUT_BUFFER, [x0, #0x10]
+ str PUT_BITSw, [x0, #0x18]
+ ldp x19, x20, [sp], 16
+ add x0, BUFFER, #0x1
+ add sp, sp, 256
+ br x30
+
+.endm
+
+generate_jsimd_huff_encode_one_block 1
+generate_jsimd_huff_encode_one_block 0
+
+ .unreq BUFFER
+ .unreq PUT_BUFFER
+ .unreq PUT_BITS
+ .unreq PUT_BITSw
+
+.purgem emit_byte
+.purgem put_bits
+.purgem checkbuf31
+.purgem checkbuf47
diff --git a/media/libjpeg/simd/arm/align.h b/media/libjpeg/simd/arm/align.h
new file mode 100644
index 0000000000..cff4241e84
--- /dev/null
+++ b/media/libjpeg/simd/arm/align.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* How to obtain memory alignment for structures and variables */
+#if defined(_MSC_VER)
+#define ALIGN(alignment) __declspec(align(alignment))
+#elif defined(__clang__) || defined(__GNUC__)
+#define ALIGN(alignment) __attribute__((aligned(alignment)))
+#else
+#error "Unknown compiler"
+#endif
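+
+/* Example usage (the array name is a placeholder), as in the Neon constant
+ * tables elsewhere in this directory:
+ *
+ *   ALIGN(16) static const uint16_t consts[8] = { 0 };
+ *
+ * which expands to __attribute__((aligned(16))) or __declspec(align(16)),
+ * depending on the compiler.
+ */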
diff --git a/media/libjpeg/simd/arm/jccolor-neon.c b/media/libjpeg/simd/arm/jccolor-neon.c
new file mode 100644
index 0000000000..9fcc62dd25
--- /dev/null
+++ b/media/libjpeg/simd/arm/jccolor-neon.c
@@ -0,0 +1,160 @@
+/*
+ * jccolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> YCbCr conversion constants */
+
+#define F_0_298 19595
+#define F_0_587 38470
+#define F_0_113 7471
+#define F_0_168 11059
+#define F_0_331 21709
+#define F_0_500 32768
+#define F_0_418 27439
+#define F_0_081 5329
+
+ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
+ F_0_298, F_0_587, F_0_113, F_0_168,
+ F_0_331, F_0_500, F_0_418, F_0_081
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
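+/* jccolext-neon.c is included once per supported pixel format: each block
+ * below redefines RGB_RED/RGB_GREEN/RGB_BLUE/RGB_PIXELSIZE and renames
+ * jsimd_rgb_ycc_convert_neon before re-including it, so the one template
+ * source yields a specialized conversion routine for every extension.
+ */
+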
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extrgbx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extbgrx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extxbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extxrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
diff --git a/media/libjpeg/simd/arm/jcgray-neon.c b/media/libjpeg/simd/arm/jcgray-neon.c
new file mode 100644
index 0000000000..71c7b2de21
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcgray-neon.c
@@ -0,0 +1,120 @@
+/*
+ * jcgray-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> Grayscale conversion constants */
+
+#define F_0_298 19595
+#define F_0_587 38470
+#define F_0_113 7471
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extrgbx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extbgrx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extxbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extxrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
diff --git a/media/libjpeg/simd/arm/jcgryext-neon.c b/media/libjpeg/simd/arm/jcgryext-neon.c
new file mode 100644
index 0000000000..416a7385df
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcgryext-neon.c
@@ -0,0 +1,106 @@
+/*
+ * jcgryext-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-neon.c */
+
+
+/* RGB -> Grayscale conversion is defined by the following equation:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * These constants are defined in jcgray-neon.c
+ *
+ * This is the same computation as the RGB -> Y portion of RGB -> YCbCr.
+ */
+
+void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ JSAMPROW inptr;
+ JSAMPROW outptr;
+ /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+ ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr = output_buf[0][output_row];
+ output_row++;
+
+ int cols_remaining = image_width;
+ for (; cols_remaining > 0; cols_remaining -= 16) {
+
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 16) columns of data are first memcopied to a temporary
+ * buffer large enough to accommodate the vector load.
+ */
+ if (cols_remaining < 16) {
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+ }
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298);
+ uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298);
+ uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298);
+ uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298);
+ y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587);
+ y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587);
+ y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587);
+ y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587);
+ y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113);
+ y_lh = vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113);
+ y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113);
+ y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+
+ /* Narrow Y values to 8-bit and store to memory. Buffer overwrite is
+ * permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
+ vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+
+ /* Increment pointers. */
+ inptr += (16 * RGB_PIXELSIZE);
+ outptr += 16;
+ }
+ }
+}
diff --git a/media/libjpeg/simd/arm/jchuff.h b/media/libjpeg/simd/arm/jchuff.h
new file mode 100644
index 0000000000..2fbd252b9b
--- /dev/null
+++ b/media/libjpeg/simd/arm/jchuff.h
@@ -0,0 +1,131 @@
+/*
+ * jchuff.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2018, 2021, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020-2021, Arm Limited.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+/* Expanded entropy encoder object for Huffman encoding.
+ *
+ * The savable_state subrecord contains fields that change within an MCU,
+ * but must not be updated permanently until we complete the MCU.
+ */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define BIT_BUF_SIZE 64
+#else
+#define BIT_BUF_SIZE 32
+#endif
+
+typedef struct {
+ size_t put_buffer; /* current bit accumulation buffer */
+ int free_bits; /* # of bits available in it */
+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+} savable_state;
+
+typedef struct {
+ JOCTET *next_output_byte; /* => next byte to write in buffer */
+ size_t free_in_buffer; /* # of byte spaces remaining in buffer */
+ savable_state cur; /* Current bit buffer & DC state */
+ j_compress_ptr cinfo; /* dump_buffer needs access to this */
+ int simd;
+} working_state;
+
+/* Outputting bits to the file */
+
+/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be encoded
+ * as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the byte is
+ * 0xFF. Otherwise, the output buffer pointer is advanced by 1, and the
+ * speculative 0 byte will be overwritten by the next byte.
+ */
+#define EMIT_BYTE(b) { \
+ buffer[0] = (JOCTET)(b); \
+ buffer[1] = 0; \
+ buffer -= -2 + ((JOCTET)(b) < 0xFF); \
+}
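+
+/* The pointer update in EMIT_BYTE() is a branchless equivalent of:
+ *
+ *   if ((JOCTET)(b) == 0xFF)
+ *     buffer += 2;       // keep the stuffed 0x00 byte
+ *   else
+ *     buffer += 1;       // the next byte overwrites the speculative 0
+ */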
+
+/* Output the entire bit buffer. If there are no 0xFF bytes in it, then write
+ * directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
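+/* The mask test below is a SWAR check: for each byte b of put_buffer,
+ * (b & 0x80) & ~(b + 1) has its high bit set only when b == 0xFF, so the
+ * whole expression is nonzero exactly when at least one buffered byte
+ * requires 0xFF 0x00 stuffing.
+ */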
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#define FLUSH() { \
+ if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+ EMIT_BYTE(put_buffer >> 56) \
+ EMIT_BYTE(put_buffer >> 48) \
+ EMIT_BYTE(put_buffer >> 40) \
+ EMIT_BYTE(put_buffer >> 32) \
+ EMIT_BYTE(put_buffer >> 24) \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ *((uint64_t *)buffer) = BUILTIN_BSWAP64(put_buffer); \
+ buffer += 8; \
+ } \
+}
+
+#else
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+ buffer[0] = (JOCTET)(put_buffer >> 24); \
+ buffer[1] = (JOCTET)(put_buffer >> 16); \
+ buffer[2] = (JOCTET)(put_buffer >> 8); \
+ buffer[3] = (JOCTET)(put_buffer ); \
+ buffer += 4; \
+}
+#else
+#define SPLAT() { \
+ put_buffer = __builtin_bswap32(put_buffer); \
+ __asm__("str %1, [%0], #4" : "+r" (buffer) : "r" (put_buffer)); \
+}
+#endif
+
+#define FLUSH() { \
+ if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+ EMIT_BYTE(put_buffer >> 24) \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ SPLAT(); \
+ } \
+}
+
+#endif
+
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+ put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+ FLUSH() \
+ free_bits += BIT_BUF_SIZE; \
+ put_buffer = code; \
+}
+
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { \
+ free_bits -= size; \
+ if (free_bits < 0) \
+ PUT_AND_FLUSH(code, size) \
+ else \
+ put_buffer = (put_buffer << size) | code; \
+}
+
+#define PUT_CODE(code, size, diff) { \
+ diff |= code << nbits; \
+ nbits += size; \
+ PUT_BITS(diff, nbits) \
+}
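
The FLUSH() variants above detect whether any byte of the accumulator is 0xFF with a single mask expression and fall back to EMIT_BYTE() only when byte stuffing is actually required; otherwise the whole buffer is byte-swapped and stored in one write. A minimal scalar sketch of the underlying stuffing rule follows (illustration only; the function name and buffer handling are hypothetical).

#include <stddef.h>
#include <stdint.h>

/* Sketch: write the bytes of a 32-bit bit buffer most-significant first,
 * inserting a 0x00 after every 0xFF so that entropy-coded data can never be
 * mistaken for a JPEG marker. The caller must provide room for up to 8
 * output bytes.
 */
static size_t flush_bit_buffer(uint8_t *out, uint32_t put_buffer)
{
  size_t n = 0;
  int shift;

  for (shift = 24; shift >= 0; shift -= 8) {
    uint8_t b = (uint8_t)(put_buffer >> shift);
    out[n++] = b;
    if (b == 0xFF)
      out[n++] = 0x00;   /* stuffed byte, which EMIT_BYTE() emits speculatively */
  }
  return n;              /* number of bytes actually written */
}
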
diff --git a/media/libjpeg/simd/arm/jcphuff-neon.c b/media/libjpeg/simd/arm/jcphuff-neon.c
new file mode 100644
index 0000000000..b91c5db478
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcphuff-neon.c
@@ -0,0 +1,622 @@
+/*
+ * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* Data preparation for encode_mcu_AC_first().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+void jsimd_encode_mcu_AC_first_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits)
+{
+ JCOEF *values_ptr = values;
+ JCOEF *diff_values_ptr = values + DCTSIZE2;
+
+ /* Rows of coefficients to zero (since they haven't been processed) */
+ int i, rows_to_zero = 8;
+
+ for (i = 0; i < Sl / 16; i++) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+ /* Isolate sign of coefficients. */
+ int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+ int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+ int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_s16(values_ptr, coefs1);
+ vst1q_s16(values_ptr + DCTSIZE, coefs2);
+ vst1q_s16(diff_values_ptr, diff1);
+ vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+ values_ptr += 16;
+ diff_values_ptr += 16;
+ jpeg_natural_order_start += 16;
+ rows_to_zero -= 2;
+ }
+
+ /* Same operation but for remaining partial vector */
+ int remaining_coefs = Sl % 16;
+ if (remaining_coefs > 8) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vdupq_n_s16(0);
+ switch (remaining_coefs) {
+ case 15:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+
+ /* Isolate sign of coefficients. */
+ int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+ int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+ int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_s16(values_ptr, coefs1);
+ vst1q_s16(values_ptr + DCTSIZE, coefs2);
+ vst1q_s16(diff_values_ptr, diff1);
+ vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+ values_ptr += 16;
+ diff_values_ptr += 16;
+ rows_to_zero -= 2;
+
+ } else if (remaining_coefs > 0) {
+ int16x8_t coefs = vdupq_n_s16(0);
+
+ switch (remaining_coefs) {
+ case 8:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 7:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+
+ /* Isolate sign of coefficients. */
+ int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs = vabsq_s16(coefs);
+ coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ int16x8_t diff = veorq_s16(coefs, sign_coefs);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_s16(values_ptr, coefs);
+ vst1q_s16(diff_values_ptr, diff);
+ values_ptr += 8;
+ diff_values_ptr += 8;
+ rows_to_zero--;
+ }
+
+ /* Zero remaining memory in the values and diff_values blocks. */
+ for (i = 0; i < rows_to_zero; i++) {
+ vst1q_s16(values_ptr, vdupq_n_s16(0));
+ vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
+ values_ptr += 8;
+ diff_values_ptr += 8;
+ }
+
+ /* Construct zerobits bitmap. A set bit means that the corresponding
+ * coefficient != 0.
+ */
+ int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
+ int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
+ int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);
+
+ uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
+ uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
+ uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
+ uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
+ uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
+ uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
+ uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
+ uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
+
+ /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+ const uint8x8_t bitmap_mask =
+ vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+ row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
+ row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
+ row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
+ row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
+ row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
+ row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
+ row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
+ row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
+
+ uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
+ uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
+ uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
+ uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
+ uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+ /* Store zerobits bitmap. */
+ *zerobits = ~bitmap;
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+ /* Store zerobits bitmap. */
+ zerobits[0] = ~bitmap0;
+ zerobits[1] = ~bitmap1;
+#endif
+}
+
+
+/* Data preparation for encode_mcu_AC_refine().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+int jsimd_encode_mcu_AC_refine_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits)
+{
+ /* Temporary storage buffers for data used to compute the signbits bitmap and
+ * the end-of-block (EOB) position
+ */
+ uint8_t coef_sign_bits[64];
+ uint8_t coef_eq1_bits[64];
+
+ JCOEF *absvalues_ptr = absvalues;
+ uint8_t *coef_sign_bits_ptr = coef_sign_bits;
+ uint8_t *eq1_bits_ptr = coef_eq1_bits;
+
+ /* Rows of coefficients to zero (since they haven't been processed) */
+ int i, rows_to_zero = 8;
+
+ for (i = 0; i < Sl / 16; i++) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+ /* Compute and store data for signbits bitmap. */
+ uint8x8_t sign_coefs1 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+ uint8x8_t sign_coefs2 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+ vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+ vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+ vst1q_s16(absvalues_ptr, coefs1);
+ vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+ /* Test whether transformed coefficient values == 1 (used to find EOB
+ * position.)
+ */
+ uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+ uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+ vst1_u8(eq1_bits_ptr, coefs_eq11);
+ vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+ absvalues_ptr += 16;
+ coef_sign_bits_ptr += 16;
+ eq1_bits_ptr += 16;
+ jpeg_natural_order_start += 16;
+ rows_to_zero -= 2;
+ }
+
+ /* Same operation but for remaining partial vector */
+ int remaining_coefs = Sl % 16;
+ if (remaining_coefs > 8) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vdupq_n_s16(0);
+ switch (remaining_coefs) {
+ case 15:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+
+ /* Compute and store data for signbits bitmap. */
+ uint8x8_t sign_coefs1 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+ uint8x8_t sign_coefs2 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+ vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+ vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+ vst1q_s16(absvalues_ptr, coefs1);
+ vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+ /* Test whether transformed coefficient values == 1 (used to find EOB
+ * position.)
+ */
+ uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+ uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+ vst1_u8(eq1_bits_ptr, coefs_eq11);
+ vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+ absvalues_ptr += 16;
+ coef_sign_bits_ptr += 16;
+ eq1_bits_ptr += 16;
+ jpeg_natural_order_start += 16;
+ rows_to_zero -= 2;
+
+ } else if (remaining_coefs > 0) {
+ int16x8_t coefs = vdupq_n_s16(0);
+
+ switch (remaining_coefs) {
+ case 8:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 7:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+
+ /* Compute and store data for signbits bitmap. */
+ uint8x8_t sign_coefs =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
+ vst1_u8(coef_sign_bits_ptr, sign_coefs);
+
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs = vabsq_s16(coefs);
+ coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+ vst1q_s16(absvalues_ptr, coefs);
+
+ /* Test whether transformed coefficient values == 1 (used to find EOB
+ * position.)
+ */
+ uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
+ vst1_u8(eq1_bits_ptr, coefs_eq1);
+
+ absvalues_ptr += 8;
+ coef_sign_bits_ptr += 8;
+ eq1_bits_ptr += 8;
+ rows_to_zero--;
+ }
+
+ /* Zero remaining memory in blocks. */
+ for (i = 0; i < rows_to_zero; i++) {
+ vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
+ vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
+ vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
+ absvalues_ptr += 8;
+ coef_sign_bits_ptr += 8;
+ eq1_bits_ptr += 8;
+ }
+
+ /* Construct zerobits bitmap. */
+ int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
+ int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
+ int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
+ int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
+ int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
+ int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
+ int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
+ int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);
+
+ uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
+ uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
+ uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
+ uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
+ uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
+ uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
+ uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
+ uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
+
+ /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+ const uint8x8_t bitmap_mask =
+ vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+ abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
+ abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
+ abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
+ abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
+ abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
+ abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
+ abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
+ abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
+
+ uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
+ uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
+ uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
+ uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
+ uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+ /* Store zerobits bitmap. */
+ bits[0] = ~bitmap;
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+ /* Store zerobits bitmap. */
+ bits[0] = ~bitmap0;
+ bits[1] = ~bitmap1;
+#endif
+
+ /* Construct signbits bitmap. */
+ uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
+ uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
+ uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
+ uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
+ uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
+ uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
+ uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
+ uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
+
+ signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
+ signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
+ signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
+ signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
+ signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
+ signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
+ signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
+ signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
+
+ bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
+ bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
+ bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
+ bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
+ bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+ /* Store signbits bitmap. */
+ bits[1] = ~bitmap;
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+ /* Store signbits bitmap. */
+ bits[2] = ~bitmap0;
+ bits[3] = ~bitmap1;
+#endif
+
+ /* Construct bitmap to find EOB position (the index of the last coefficient
+ * equal to 1.)
+ */
+ uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
+ uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
+ uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
+ uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
+ uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
+ uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
+ uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
+ uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
+
+ row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
+ row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
+ row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
+ row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
+ row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
+ row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
+ row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
+ row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
+
+ bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
+ bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
+ bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
+ bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
+ bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+ /* Return EOB position. */
+ if (bitmap == 0) {
+ /* EOB position is defined to be 0 if all coefficients != 1. */
+ return 0;
+ } else {
+ return 63 - BUILTIN_CLZLL(bitmap);
+ }
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+
+ /* Return EOB position. */
+ if (bitmap0 == 0 && bitmap1 == 0) {
+ return 0;
+ } else if (bitmap1 != 0) {
+ return 63 - BUILTIN_CLZ(bitmap1);
+ } else {
+ return 31 - BUILTIN_CLZ(bitmap0);
+ }
+#endif
+}
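
Both prepare routines reduce eight 8-lane comparison masks to a 64-bit bitmap by AND-ing with the { 0x01 ... 0x80 } constant and folding three times with vpadd_u8, then invert the zerobits result because the lane tests check for equality with zero. A scalar sketch of the equivalent zerobits bitmap and, for the refine variant, the EOB computation follows (simplified illustration; it assumes all 64 coefficients are already gathered and a 64-bit zerobits word, and uses the GCC/Clang __builtin_clzll as a stand-in for BUILTIN_CLZLL).

#include <stdint.h>

/* Sketch: bit k of each bitmap corresponds to coefficient k, matching the
 * 0x01..0x80 per-lane weights used above. zerobits has a bit set for every
 * nonzero value; the return value is the EOB position, i.e. the index of the
 * last value equal to 1 (0 if there is none).
 */
static int bitmaps_scalar(const int16_t absvalues[64], uint64_t *zerobits)
{
  uint64_t nonzero = 0, eq1 = 0;
  int k;

  for (k = 0; k < 64; k++) {
    if (absvalues[k] != 0) nonzero |= 1ULL << k;
    if (absvalues[k] == 1) eq1 |= 1ULL << k;
  }
  *zerobits = nonzero;
  return eq1 ? 63 - __builtin_clzll(eq1) : 0;
}
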
diff --git a/media/libjpeg/simd/arm/jcsample-neon.c b/media/libjpeg/simd/arm/jcsample-neon.c
new file mode 100644
index 0000000000..8a3e237838
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcsample-neon.c
@@ -0,0 +1,192 @@
+/*
+ * jcsample-neon.c - downsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5 */
+ 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6 */
+ 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7 */
+ 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8 */
+ 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9 */
+ 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10 */
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11 */
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+ 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12 */
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+ 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13 */
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14 */
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the common case of 2:1 horizontal and 1:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ JSAMPROW inptr, outptr;
+ /* Load expansion mask to pad remaining elements of last DCT block. */
+ const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+ const uint8x16_t expand_mask =
+ vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+ /* Load bias pattern (alternating every pixel.) */
+ /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
+ const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
+ unsigned i, outrow;
+
+ for (outrow = 0; outrow < v_samp_factor; outrow++) {
+ outptr = output_data[outrow];
+ inptr = input_data[outrow];
+
+ /* Downsample all but the last DCT block of pixels. */
+ for (i = 0; i < width_in_blocks - 1; i++) {
+ uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
+ /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+ /* Divide total by 2 and narrow to 8-bit. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+ /* Store samples to memory. */
+ vst1_u8(outptr + i * DCTSIZE, samples_u8);
+ }
+
+ /* Load pixels in last DCT block into a table. */
+ uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Pad the empty elements with the value of the last pixel. */
+ pixels = vqtbl1q_u8(pixels, expand_mask);
+#else
+ uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
+ pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
+ vtbl2_u8(table, vget_high_u8(expand_mask)));
+#endif
+ /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+ /* Divide total by 2, narrow to 8-bit, and store. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+ vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+ }
+}
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ JSAMPROW inptr0, inptr1, outptr;
+ /* Load expansion mask to pad remaining elements of last DCT block. */
+ const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+ const uint8x16_t expand_mask =
+ vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+ /* Load bias pattern (alternating every pixel.) */
+ /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
+ const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
+ unsigned i, outrow;
+
+ for (outrow = 0; outrow < v_samp_factor; outrow++) {
+ outptr = output_data[outrow];
+ inptr0 = input_data[outrow];
+ inptr1 = input_data[outrow + 1];
+
+ /* Downsample all but the last DCT block of pixels. */
+ for (i = 0; i < width_in_blocks - 1; i++) {
+ uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
+ uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
+ /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+ /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
+ */
+ samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+ /* Divide total by 4 and narrow to 8-bit. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+      /* Store samples to memory. */
+ vst1_u8(outptr + i * DCTSIZE, samples_u8);
+ }
+
+ /* Load pixels in last DCT block into a table. */
+ uint8x16_t pixels_r0 =
+ vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
+ uint8x16_t pixels_r1 =
+ vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Pad the empty elements with the value of the last pixel. */
+ pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
+ pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
+#else
+ uint8x8x2_t table_r0 =
+ { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
+ uint8x8x2_t table_r1 =
+ { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
+ pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
+ vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
+ pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
+ vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
+#endif
+ /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+ /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
+ samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+ /* Divide total by 4, narrow to 8-bit, and store. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+ vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+ }
+}
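
Each output sample above is the sum of two (h2v1) or four (h2v2) input pixels plus the alternating bias loaded into the accumulator, so rounding does not always fall in the same direction across columns. A scalar sketch of the h2v1 arithmetic for one output column follows (illustration only; the helper name is hypothetical).

#include <stdint.h>

/* Sketch: 2:1 horizontal downsampling of one output sample. The bias is 0
 * for even output columns and 1 for odd ones, mirroring the { 0, 1, ... }
 * pattern that vpadalq_u8(bias, pixels) adds before the pairwise sum.
 */
static uint8_t h2v1_downsample_one(const uint8_t *inrow, int out_col)
{
  int bias = out_col & 1;
  int sum = inrow[2 * out_col] + inrow[2 * out_col + 1] + bias;
  return (uint8_t)(sum >> 1);   /* divide by 2, as vshrn_n_u16(..., 1) */
}
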
diff --git a/media/libjpeg/simd/arm/jdcolext-neon.c b/media/libjpeg/simd/arm/jdcolext-neon.c
new file mode 100644
index 0000000000..c3c07a1964
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdcolext-neon.c
@@ -0,0 +1,374 @@
+/*
+ * jdcolext-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-neon.c. */
+
+
+/* YCbCr -> RGB conversion is defined by the following equations:
+ * R = Y + 1.40200 * (Cr - 128)
+ * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ * B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.3441467 = 11277 * 2^-15
+ * 0.7141418 = 23401 * 2^-15
+ * 1.4020386 = 22971 * 2^-14
+ * 1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdcolor-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for YCbCr -> RGB conversion routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ JSAMPROW outptr;
+ /* Pointers to Y, Cb, and Cr data */
+ JSAMPROW inptr0, inptr1, inptr2;
+
+ const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+ const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+ while (--num_rows >= 0) {
+ inptr0 = input_buf[0][input_row];
+ inptr1 = input_buf[1][input_row];
+ inptr2 = input_buf[2][input_row];
+ input_row++;
+ outptr = *output_buf++;
+ int cols_remaining = output_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+ uint8x16_t y = vld1q_u8(inptr0);
+ uint8x16_t cb = vld1q_u8(inptr1);
+ uint8x16_t cr = vld1q_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_low_u8(cr)));
+ int16x8_t cr_128_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_high_u8(cr)));
+ int16x8_t cb_128_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_low_u8(cb)));
+ int16x8_t cb_128_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_high_u8(cb)));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
+ int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
+ consts, 0);
+ int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
+ int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
+ consts, 0);
+ g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
+ consts, 1);
+ g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
+ consts, 1);
+ g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
+ consts, 1);
+ g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
+ consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
+ vrshrn_n_s32(g_sub_y_lh, 15));
+ int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
+ vrshrn_n_s32(g_sub_y_hh, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
+ consts, 2);
+ int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
+ consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
+ consts, 3);
+ int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
+ consts, 3);
+ /* Add Y. */
+ int16x8_t r_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
+ vget_low_u8(y)));
+ int16x8_t r_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
+ vget_high_u8(y)));
+ int16x8_t b_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
+ vget_low_u8(y)));
+ int16x8_t b_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
+ vget_high_u8(y)));
+ int16x8_t g_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
+ vget_low_u8(y)));
+ int16x8_t g_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
+ vget_high_u8(y)));
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t rgba;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+ rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+ rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4q_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+ uint8x16x3_t rgb;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+ rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+ rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+ /* Store RGB pixel data to memory. */
+ vst3q_u8(outptr, rgb);
+#else
+ /* Pack R, G, and B values in ratio 5:6:5. */
+ uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
+ rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
+ rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
+ uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
+ rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
+ rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
+ /* Store RGB pixel data to memory. */
+ vst1q_u16((uint16_t *)outptr, rgb565_l);
+ vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
+#endif
+
+ /* Increment pointers. */
+ inptr0 += 16;
+ inptr1 += 16;
+ inptr2 += 16;
+ outptr += (RGB_PIXELSIZE * 16);
+ }
+
+ if (cols_remaining >= 8) {
+ uint8x8_t y = vld1_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+ consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+ consts, 3);
+ /* Add Y. */
+ int16x8_t r =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+ int16x8_t b =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+ int16x8_t g =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t rgba;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgba.val[RGB_RED] = vqmovun_s16(r);
+ rgba.val[RGB_GREEN] = vqmovun_s16(g);
+ rgba.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+ uint8x8x3_t rgb;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgb.val[RGB_RED] = vqmovun_s16(r);
+ rgb.val[RGB_GREEN] = vqmovun_s16(g);
+ rgb.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Store RGB pixel data to memory. */
+ vst3_u8(outptr, rgb);
+#else
+ /* Pack R, G, and B values in ratio 5:6:5. */
+ uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+ /* Store RGB pixel data to memory. */
+ vst1q_u16((uint16_t *)outptr, rgb565);
+#endif
+
+ /* Increment pointers. */
+ inptr0 += 8;
+ inptr1 += 8;
+ inptr2 += 8;
+ outptr += (RGB_PIXELSIZE * 8);
+ cols_remaining -= 8;
+ }
+
+ /* Handle the tail elements. */
+ if (cols_remaining > 0) {
+ uint8x8_t y = vld1_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+ consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+ consts, 3);
+ /* Add Y. */
+ int16x8_t r =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+ int16x8_t b =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+ int16x8_t g =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t rgba;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgba.val[RGB_RED] = vqmovun_s16(r);
+ rgba.val[RGB_GREEN] = vqmovun_s16(g);
+ rgba.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ switch (cols_remaining) {
+ case 7:
+ vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst4_lane_u8(outptr, rgba, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#elif RGB_PIXELSIZE == 3
+ uint8x8x3_t rgb;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgb.val[RGB_RED] = vqmovun_s16(r);
+ rgb.val[RGB_GREEN] = vqmovun_s16(g);
+ rgb.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Store RGB pixel data to memory. */
+ switch (cols_remaining) {
+ case 7:
+ vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst3_lane_u8(outptr, rgb, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#else
+ /* Pack R, G, and B values in ratio 5:6:5. */
+ uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+ /* Store RGB565 pixel data to memory. */
+ switch (cols_remaining) {
+ case 7:
+ vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#endif
+ }
+ }
+}
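
A scalar sketch of the per-pixel fixed-point arithmetic that the vectorized loops above implement, using the constant values defined in jdcolor-neon.c, follows. It is an illustration only: the helper names are hypothetical, plain C shifts stand in for the Neon rounding instructions (so results may differ by +/-1 from vqrdmulhq_lane_s16/vrshrn_n_s32), and an arithmetic right shift of negative values is assumed.

#include <stdint.h>

static uint8_t clamp_u8(int v)
{
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);   /* as vqmovun_s16 */
}

/* Sketch: one pixel of YCbCr -> RGB. The 1.402 and 1.772 constants are
 * scaled by 2^14, the 0.344 and 0.714 constants by 2^15, matching the
 * constant table in jdcolor-neon.c.
 */
static void ycc_to_rgb_scalar(uint8_t y, uint8_t cb, uint8_t cr,
                              uint8_t *r, uint8_t *g, uint8_t *b)
{
  int cb_128 = cb - 128, cr_128 = cr - 128;

  int r_sub_y = (22971 * cr_128 + (1 << 13)) >> 14;                    /*  1.402 */
  int b_sub_y = (29033 * cb_128 + (1 << 13)) >> 14;                    /*  1.772 */
  int g_sub_y = (-11277 * cb_128 - 23401 * cr_128 + (1 << 14)) >> 15;  /* -0.344, -0.714 */

  *r = clamp_u8(y + r_sub_y);
  *g = clamp_u8(y + g_sub_y);
  *b = clamp_u8(y + b_sub_y);
}
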
diff --git a/media/libjpeg/simd/arm/jdcolor-neon.c b/media/libjpeg/simd/arm/jdcolor-neon.c
new file mode 100644
index 0000000000..ea4668f1d3
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdcolor-neon.c
@@ -0,0 +1,142 @@
+/*
+ * jdcolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants */
+
+#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+ -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgbx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgrx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+/* YCbCr -> RGB565 Conversion */
+
+#define RGB_PIXELSIZE 2
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_rgb565_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
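
jdcolor-neon.c compiles the jdcolext-neon.c body once per output pixel format by redefining RGB_RED, RGB_GREEN, RGB_BLUE, RGB_ALPHA, RGB_PIXELSIZE, and the function name before each #include; the RGB565 path reuses the same body and differs only in the packing step. A scalar sketch of the 5:6:5 packing performed by the vqshluq_n_s16/vsriq_n_u16 sequence in jdcolext-neon.c follows (illustration only; the function name is hypothetical).

#include <stdint.h>

/* Sketch: pack one already-clamped R, G, B triple into RGB565:
 * R[7:3] -> bits 15..11, G[7:2] -> bits 10..5, B[7:3] -> bits 4..0.
 */
static uint16_t pack_rgb565(uint8_t r, uint8_t g, uint8_t b)
{
  return (uint16_t)(((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3));
}
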
diff --git a/media/libjpeg/simd/arm/jdmerge-neon.c b/media/libjpeg/simd/arm/jdmerge-neon.c
new file mode 100644
index 0000000000..e4f91fdc0e
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdmerge-neon.c
@@ -0,0 +1,145 @@
+/*
+ * jdmerge-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants */
+
+#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+ -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgbx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgbx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgrx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgrx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
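
The repeated #define/#include/#undef blocks above are the standard libjpeg-turbo technique for generating one copy of the routines in jdmrgext-neon.c per output pixel format: the included file refers only to the RGB_* macros and the two function-name macros, so each inclusion compiles to a separate pair of functions with the channel order baked in. A reduced sketch of the same pattern, with purely illustrative file and function names:

/* pixtmpl.c -- template, included once per channel order */
void PUT_GRAY_FUNC(unsigned char *out, int gray)
{
  out[CHAN_RED] = (unsigned char)gray;
  out[CHAN_GREEN] = (unsigned char)gray;
  out[CHAN_BLUE] = (unsigned char)gray;
}

/* driver.c -- instantiates the template for RGB and BGR orderings */
#define CHAN_RED    0
#define CHAN_GREEN  1
#define CHAN_BLUE   2
#define PUT_GRAY_FUNC  put_gray_rgb
#include "pixtmpl.c"
#undef CHAN_RED
#undef CHAN_GREEN
#undef CHAN_BLUE
#undef PUT_GRAY_FUNC

#define CHAN_RED    2
#define CHAN_GREEN  1
#define CHAN_BLUE   0
#define PUT_GRAY_FUNC  put_gray_bgr
#include "pixtmpl.c"
#undef CHAN_RED
#undef CHAN_GREEN
#undef CHAN_BLUE
#undef PUT_GRAY_FUNC
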
diff --git a/media/libjpeg/simd/arm/jdmrgext-neon.c b/media/libjpeg/simd/arm/jdmrgext-neon.c
new file mode 100644
index 0000000000..5b89bdb339
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdmrgext-neon.c
@@ -0,0 +1,723 @@
+/*
+ * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-neon.c. */
+
+
+/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
+ * chroma upsampling and YCbCr -> RGB color conversion into a single function.
+ *
+ * As with the standalone functions, YCbCr -> RGB conversion is defined by the
+ * following equations:
+ * R = Y + 1.40200 * (Cr - 128)
+ * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ * B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.3441467 = 11277 * 2^-15
+ * 0.7141418 = 23401 * 2^-15
+ * 1.4020386 = 22971 * 2^-14
+ * 1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdmerge-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
+ * routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
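
Putting the two comment blocks above together, the per-pixel arithmetic that the routines below vectorize can be sketched in scalar form as follows (illustrative helpers using the same Q15/Q14 constants; clamping stands in for the saturating narrow, and an arithmetic right shift of negative values is assumed):

static unsigned char clamp255(int v)
{
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Convert two horizontally adjacent Y samples that share one (Cb, Cr) pair.
 * This is the h2v1 "merged" case: the chroma differences are computed once
 * and added to both luma samples.
 */
static void ycc_pair_to_rgb(int y0, int y1, int cb, int cr,
                            unsigned char rgb[6])
{
  int cb_128 = cb - 128, cr_128 = cr - 128;
  int r_sub_y = (2 * cr_128 * 22971 + 0x8000) >> 16;  /* ~1.40200 * (Cr-128) */
  int b_sub_y = (2 * cb_128 * 29033 + 0x8000) >> 16;  /* ~1.77200 * (Cb-128) */
  int g_sub_y = (-11277 * cb_128 - 23401 * cr_128 + 0x4000) >> 15;

  rgb[0] = clamp255(y0 + r_sub_y);  rgb[3] = clamp255(y1 + r_sub_y);
  rgb[1] = clamp255(y0 + g_sub_y);  rgb[4] = clamp255(y1 + g_sub_y);
  rgb[2] = clamp255(y0 + b_sub_y);  rgb[5] = clamp255(y1 + b_sub_y);
}
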
+
+/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+ */
+
+void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ JSAMPROW outptr;
+ /* Pointers to Y, Cb, and Cr data */
+ JSAMPROW inptr0, inptr1, inptr2;
+
+ const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+ const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+ inptr0 = input_buf[0][in_row_group_ctr];
+ inptr1 = input_buf[1][in_row_group_ctr];
+ inptr2 = input_buf[2][in_row_group_ctr];
+ outptr = output_buf[0];
+
+ int cols_remaining = output_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+ /* De-interleave Y component values into two separate vectors, one
+ * containing the component values with even-numbered indices and one
+ * containing the component values with odd-numbered indices.
+ */
+ uint8x8x2_t y = vld2_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+ * "odd" Y component values. This effectively upsamples the chroma
+ * components horizontally.
+ */
+ int16x8_t g_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[0]));
+ int16x8_t r_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[0]));
+ int16x8_t b_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[0]));
+ int16x8_t g_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[1]));
+ int16x8_t r_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[1]));
+ int16x8_t b_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
+ uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+ uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+ uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+ uint8x16x4_t rgba;
+ rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+ rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+ rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4q_u8(outptr, rgba);
+#else
+ uint8x16x3_t rgb;
+ rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+ rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+ rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+ /* Store RGB pixel data to memory. */
+ vst3q_u8(outptr, rgb);
+#endif
+
+ /* Increment pointers. */
+ inptr0 += 16;
+ inptr1 += 8;
+ inptr2 += 8;
+ outptr += (RGB_PIXELSIZE * 16);
+ }
+
+ if (cols_remaining > 0) {
+ /* De-interleave Y component values into two separate vectors, one
+ * containing the component values with even-numbered indices and one
+ * containing the component values with odd-numbered indices.
+ */
+ uint8x8x2_t y = vld2_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+ * "odd" Y component values. This effectively upsamples the chroma
+ * components horizontally.
+ */
+ int16x8_t g_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[0]));
+ int16x8_t r_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[0]));
+ int16x8_t b_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[0]));
+ int16x8_t g_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[1]));
+ int16x8_t r_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[1]));
+ int16x8_t b_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
+ uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+ uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+ uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+ uint8x8x4_t rgba_h;
+ rgba_h.val[RGB_RED] = r.val[1];
+ rgba_h.val[RGB_GREEN] = g.val[1];
+ rgba_h.val[RGB_BLUE] = b.val[1];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ uint8x8x4_t rgba_l;
+ rgba_l.val[RGB_RED] = r.val[0];
+ rgba_l.val[RGB_GREEN] = g.val[0];
+ rgba_l.val[RGB_BLUE] = b.val[0];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ switch (cols_remaining) {
+ case 15:
+ vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 8:
+ vst4_u8(outptr, rgba_l);
+ break;
+ case 7:
+ vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst4_lane_u8(outptr, rgba_l, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#else
+ uint8x8x3_t rgb_h;
+ rgb_h.val[RGB_RED] = r.val[1];
+ rgb_h.val[RGB_GREEN] = g.val[1];
+ rgb_h.val[RGB_BLUE] = b.val[1];
+ uint8x8x3_t rgb_l;
+ rgb_l.val[RGB_RED] = r.val[0];
+ rgb_l.val[RGB_GREEN] = g.val[0];
+ rgb_l.val[RGB_BLUE] = b.val[0];
+ /* Store RGB pixel data to memory. */
+ switch (cols_remaining) {
+ case 15:
+ vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 8:
+ vst3_u8(outptr, rgb_l);
+ break;
+ case 7:
+ vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst3_lane_u8(outptr, rgb_l, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#endif
+ }
+}
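
The switch statements above are the partial-vector store pattern used throughout these files: the remaining pixels are written with single-lane stores, and the cases deliberately fall through so that exactly cols_remaining pixels are touched and nothing is written past output_width. The same idea, reduced to at most eight RGB pixels (illustrative helper):

#include <arm_neon.h>

/* Store the first n (1 <= n <= 8) interleaved RGB pixels of an 8-lane
 * vector triple without writing beyond n * 3 bytes.
 */
static void store_rgb_tail(unsigned char *out, uint8x8x3_t rgb, int n)
{
  switch (n) {
  case 8:
    vst3_u8(out, rgb);                         /* full width: one store */
    break;
  case 7:  vst3_lane_u8(out + 6 * 3, rgb, 6);  /* fall through */
  case 6:  vst3_lane_u8(out + 5 * 3, rgb, 5);  /* fall through */
  case 5:  vst3_lane_u8(out + 4 * 3, rgb, 4);  /* fall through */
  case 4:  vst3_lane_u8(out + 3 * 3, rgb, 3);  /* fall through */
  case 3:  vst3_lane_u8(out + 2 * 3, rgb, 2);  /* fall through */
  case 2:  vst3_lane_u8(out + 1 * 3, rgb, 1);  /* fall through */
  case 1:  vst3_lane_u8(out, rgb, 0);
  default:
    break;
  }
}
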
+
+
+/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+ *
+ * See comments above for details regarding color conversion and safe memory
+ * access.
+ */
+
+void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ JSAMPROW outptr0, outptr1;
+ /* Pointers to Y (both rows), Cb, and Cr data */
+ JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
+
+ const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+ const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+ inptr0_0 = input_buf[0][in_row_group_ctr * 2];
+ inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
+ inptr1 = input_buf[1][in_row_group_ctr];
+ inptr2 = input_buf[2][in_row_group_ctr];
+ outptr0 = output_buf[0];
+ outptr1 = output_buf[1];
+
+ int cols_remaining = output_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+ /* For each row, de-interleave Y component values into two separate
+ * vectors, one containing the component values with even-numbered indices
+ * and one containing the component values with odd-numbered indices.
+ */
+ uint8x8x2_t y0 = vld2_u8(inptr0_0);
+ uint8x8x2_t y1 = vld2_u8(inptr0_1);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+ * the "even" and "odd" Y component values. This effectively upsamples the
+ * chroma components both horizontally and vertically.
+ */
+ int16x8_t g0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[0]));
+ int16x8_t r0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[0]));
+ int16x8_t b0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[0]));
+ int16x8_t g0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[1]));
+ int16x8_t r0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[1]));
+ int16x8_t b0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[1]));
+ int16x8_t g1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[0]));
+ int16x8_t r1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[0]));
+ int16x8_t b1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[0]));
+ int16x8_t g1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[1]));
+ int16x8_t r1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[1]));
+ int16x8_t b1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
+ uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+ uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+ uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+ uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+ uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+ uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+ uint8x16x4_t rgba0, rgba1;
+ rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+ rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+ rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+ rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+ rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+ rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4q_u8(outptr0, rgba0);
+ vst4q_u8(outptr1, rgba1);
+#else
+ uint8x16x3_t rgb0, rgb1;
+ rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+ rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+ rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+ rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+ rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+ rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+ /* Store RGB pixel data to memory. */
+ vst3q_u8(outptr0, rgb0);
+ vst3q_u8(outptr1, rgb1);
+#endif
+
+ /* Increment pointers. */
+ inptr0_0 += 16;
+ inptr0_1 += 16;
+ inptr1 += 8;
+ inptr2 += 8;
+ outptr0 += (RGB_PIXELSIZE * 16);
+ outptr1 += (RGB_PIXELSIZE * 16);
+ }
+
+ if (cols_remaining > 0) {
+ /* For each row, de-interleave Y component values into two separate
+ * vectors, one containing the component values with even-numbered indices
+ * and one containing the component values with odd-numbered indices.
+ */
+ uint8x8x2_t y0 = vld2_u8(inptr0_0);
+ uint8x8x2_t y1 = vld2_u8(inptr0_1);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+ * the "even" and "odd" Y component values. This effectively upsamples the
+ * chroma components both horizontally and vertically.
+ */
+ int16x8_t g0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[0]));
+ int16x8_t r0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[0]));
+ int16x8_t b0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[0]));
+ int16x8_t g0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[1]));
+ int16x8_t r0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[1]));
+ int16x8_t b0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[1]));
+ int16x8_t g1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[0]));
+ int16x8_t r1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[0]));
+ int16x8_t b1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[0]));
+ int16x8_t g1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[1]));
+ int16x8_t r1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[1]));
+ int16x8_t b1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
+ uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+ uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+ uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+ uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+ uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+ uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+ uint8x8x4_t rgba0_h, rgba1_h;
+ rgba0_h.val[RGB_RED] = r0.val[1];
+ rgba1_h.val[RGB_RED] = r1.val[1];
+ rgba0_h.val[RGB_GREEN] = g0.val[1];
+ rgba1_h.val[RGB_GREEN] = g1.val[1];
+ rgba0_h.val[RGB_BLUE] = b0.val[1];
+ rgba1_h.val[RGB_BLUE] = b1.val[1];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+
+ uint8x8x4_t rgba0_l, rgba1_l;
+ rgba0_l.val[RGB_RED] = r0.val[0];
+ rgba1_l.val[RGB_RED] = r1.val[0];
+ rgba0_l.val[RGB_GREEN] = g0.val[0];
+ rgba1_l.val[RGB_GREEN] = g1.val[0];
+ rgba0_l.val[RGB_BLUE] = b0.val[0];
+ rgba1_l.val[RGB_BLUE] = b1.val[0];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ switch (cols_remaining) {
+ case 15:
+ vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
+ vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
+ vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
+ vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
+ vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
+ vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
+ vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
+ vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 8:
+ vst4_u8(outptr0, rgba0_l);
+ vst4_u8(outptr1, rgba1_l);
+ break;
+ case 7:
+ vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
+ vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
+ vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
+ vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
+ vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
+ vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
+ vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst4_lane_u8(outptr0, rgba0_l, 0);
+ vst4_lane_u8(outptr1, rgba1_l, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#else
+ uint8x8x3_t rgb0_h, rgb1_h;
+ rgb0_h.val[RGB_RED] = r0.val[1];
+ rgb1_h.val[RGB_RED] = r1.val[1];
+ rgb0_h.val[RGB_GREEN] = g0.val[1];
+ rgb1_h.val[RGB_GREEN] = g1.val[1];
+ rgb0_h.val[RGB_BLUE] = b0.val[1];
+ rgb1_h.val[RGB_BLUE] = b1.val[1];
+
+ uint8x8x3_t rgb0_l, rgb1_l;
+ rgb0_l.val[RGB_RED] = r0.val[0];
+ rgb1_l.val[RGB_RED] = r1.val[0];
+ rgb0_l.val[RGB_GREEN] = g0.val[0];
+ rgb1_l.val[RGB_GREEN] = g1.val[0];
+ rgb0_l.val[RGB_BLUE] = b0.val[0];
+ rgb1_l.val[RGB_BLUE] = b1.val[0];
+ /* Store RGB pixel data to memory. */
+ switch (cols_remaining) {
+ case 15:
+ vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
+ vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 14:
+ vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
+ vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 13:
+ vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
+ vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 12:
+ vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
+ vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 11:
+ vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
+ vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 10:
+ vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
+ vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 9:
+ vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
+ vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 8:
+ vst3_u8(outptr0, rgb0_l);
+ vst3_u8(outptr1, rgb1_l);
+ break;
+ case 7:
+ vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
+ vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 6:
+ vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
+ vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 5:
+ vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
+ vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 4:
+ vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
+ vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 3:
+ vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
+ vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 2:
+ vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
+ vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
+ FALLTHROUGH /*FALLTHROUGH*/
+ case 1:
+ vst3_lane_u8(outptr0, rgb0_l, 0);
+ vst3_lane_u8(outptr1, rgb1_l, 0);
+ FALLTHROUGH /*FALLTHROUGH*/
+ default:
+ break;
+ }
+#endif
+ }
+}
diff --git a/media/libjpeg/simd/arm/jdsample-neon.c b/media/libjpeg/simd/arm/jdsample-neon.c
new file mode 100644
index 0000000000..90ec6782c4
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdsample-neon.c
@@ -0,0 +1,569 @@
+/*
+ * jdsample-neon.c - upsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ * s0 s1 s2
+ * +---------+---------+---------+
+ * | | | |
+ * | p0 p1 | p2 p3 | p4 p5 |
+ * | | | |
+ * +---------+---------+---------+
+ *
+ * Samples s0-s2 were created by averaging the original pixel component values
+ * centered at positions p0-p5 above. To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each row.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1. For example:
+ * p1(upsampled) = 3/4 * s0 + 1/4 * s1
+ * p2(upsampled) = 3/4 * s1 + 1/4 * s0
+ * When computing the first and last pixel component values in the row, there
+ * is no adjacent sample to blend, so:
+ * p0(upsampled) = s0
+ * p5(upsampled) = s2
+ */
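
In scalar terms, each downsampled sample s[i] away from the row ends produces the pixel pair below; one pixel of the pair is descaled with rounding (+2) and the other with the +1 ordered-dithering bias and a truncating shift, mirroring the scalar library routine that the vector code below reproduces (illustrative helper):

/* s[] is the downsampled row, out[] the upsampled row. */
static void h2v1_fancy_pixel_pair(const unsigned char *s, unsigned char *out,
                                  unsigned i)
{
  out[2 * i]     = (unsigned char)((3 * s[i] + s[i - 1] + 1) >> 2);
  out[2 * i + 1] = (unsigned char)((3 * s[i] + s[i + 1] + 2) >> 2);
}
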
+
+void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr;
+ int inrow;
+ unsigned colctr;
+ /* Set up constants. */
+ const uint16x8_t one_u16 = vdupq_n_u16(1);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow];
+ /* First pixel component value in this row of the original image */
+ *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
+
+ /* 3/4 * containing sample + 1/4 * nearest neighboring sample
+ * For p1: containing sample = s0, nearest neighboring sample = s1
+ * For p2: containing sample = s1, nearest neighboring sample = s0
+ */
+ uint8x16_t s0 = vld1q_u8(inptr);
+ uint8x16_t s1 = vld1q_u8(inptr + 1);
+ /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
+ * denote low half and high half respectively.
+ */
+ uint16x8_t s1_add_3s0_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+ uint16x8_t s1_add_3s0_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+ uint16x8_t s0_add_3s1_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+ uint16x8_t s0_add_3s1_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+ /* Add ordered dithering bias to odd pixel values. */
+ s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+ s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+ /* The offset is initially 1, because the first pixel component has already
+ * been stored. However, in subsequent iterations of the SIMD loop, this
+ * offset is (2 * colctr - 1) to stay within the bounds of the sample
+ * buffers without having to resort to a slow scalar tail case for the last
+ * (downsampled_width % 16) samples. See "Creation of 2-D sample arrays"
+ * in jmemmgr.c for more details.
+ */
+ unsigned outptr_offset = 1;
+ uint8x16x2_t output_pixels;
+
+ /* We use software pipelining to maximise performance. The code indented
+ * an extra two spaces begins the next iteration of the loop.
+ */
+ for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+
+ s0 = vld1q_u8(inptr + colctr - 1);
+ s1 = vld1q_u8(inptr + colctr);
+
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+ output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+ vrshrn_n_u16(s1_add_3s0_h, 2));
+ output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+ vshrn_n_u16(s0_add_3s1_h, 2));
+
+ /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
+ * denote low half and high half respectively.
+ */
+ s1_add_3s0_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+ s1_add_3s0_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+ s0_add_3s1_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+ s0_add_3s1_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+ /* Add ordered dithering bias to odd pixel values. */
+ s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+ s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+ /* Store pixel component values to memory. */
+ vst2q_u8(outptr + outptr_offset, output_pixels);
+ outptr_offset = 2 * colctr - 1;
+ }
+
+ /* Complete the last iteration of the loop. */
+
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+ output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+ vrshrn_n_u16(s1_add_3s0_h, 2));
+ output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+ vshrn_n_u16(s0_add_3s1_h, 2));
+ /* Store pixel component values to memory. */
+ vst2q_u8(outptr + outptr_offset, output_pixels);
+
+ /* Last pixel component value in this row of the original image */
+ outptr[2 * downsampled_width - 1] =
+ GETJSAMPLE(inptr[downsampled_width - 1]);
+ }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ * s0 s1 s2
+ * +---------+---------+---------+
+ * | p0 p1 | p2 p3 | p4 p5 |
+ * sA | | | |
+ * | p6 p7 | p8 p9 | p10 p11|
+ * +---------+---------+---------+
+ * | p12 p13| p14 p15| p16 p17|
+ * sB | | | |
+ * | p18 p19| p20 p21| p22 p23|
+ * +---------+---------+---------+
+ * | p24 p25| p26 p27| p28 p29|
+ * sC | | | |
+ * | p30 p31| p32 p33| p34 p35|
+ * +---------+---------+---------+
+ *
+ * Samples s0A-s2C were created by averaging the original pixel component
+ * values centered at positions p0-p35 above. To approximate one of those
+ * original pixel component values, we proportionally blend the sample
+ * containing the pixel center with the nearest neighboring samples in each
+ * row, column, and diagonal.
+ *
+ * An upsampled pixel component value is computed by first blending the sample
+ * containing the pixel center with the nearest neighboring samples in the
+ * same column, in the ratio 3:1, and then blending each column sum with the
+ * nearest neighboring column sum, in the ratio 3:1. For example:
+ * p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
+ * 1/4 * (3/4 * s0B + 1/4 * s0A)
+ * = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
+ * When computing the first and last pixel component values in the row, there
+ * is no horizontally adjacent sample to blend, so:
+ * p12(upsampled) = 3/4 * s0B + 1/4 * s0A
+ * p23(upsampled) = 3/4 * s2B + 1/4 * s2C
+ * When computing the first and last pixel component values in the column,
+ * there is no vertically adjacent sample to blend, so:
+ * p2(upsampled) = 3/4 * s1A + 1/4 * s0A
+ * p33(upsampled) = 3/4 * s1C + 1/4 * s2C
+ * When computing the corner pixel component values, there is no adjacent
+ * sample to blend, so:
+ * p0(upsampled) = s0A
+ * p35(upsampled) = s2C
+ */
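
Concretely, the p14 example above works out to the scalar computation below; the +8 term is the rounding applied to one pixel of each horizontal pair, while the other pixel instead receives the +7 ordered-dithering bias with a truncating shift, as in the vector code that follows (illustrative helper):

/* Upsample p14 from the diagram above, given its four surrounding samples.
 * 3 * (3 * s1B + s1A) + (3 * s0B + s0A) expands to the 9:3:3:1 weighting.
 */
static unsigned char h2v2_fancy_p14(int s0A, int s0B, int s1A, int s1B)
{
  int thiscolsum = 3 * s1B + s1A;   /* vertical 3:1 blend, column s1 */
  int lastcolsum = 3 * s0B + s0A;   /* vertical 3:1 blend, column s0 */
  return (unsigned char)((3 * thiscolsum + lastcolsum + 8) >> 4);
}
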
+
+void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+ int inrow, outrow;
+ unsigned colctr;
+ /* Set up constants. */
+ const uint16x8_t seven_u16 = vdupq_n_u16(7);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+ const uint16x8_t three_u16 = vdupq_n_u16(3);
+
+ inrow = outrow = 0;
+ while (outrow < max_v_samp_factor) {
+ inptr0 = input_data[inrow - 1];
+ inptr1 = input_data[inrow];
+ inptr2 = input_data[inrow + 1];
+ /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+ * respectively.
+ */
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+
+ /* First pixel component value in this row of the original image */
+ int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
+ *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
+ int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
+ *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
+
+ /* Step 1: Blend samples vertically in columns s0 and s1.
+ * Leave the divide by 4 until the end, when it can be done for both
+ * dimensions at once, right-shifting by 4.
+ */
+
+ /* Load and compute s0colsum0 and s0colsum1. */
+ uint8x16_t s0A = vld1q_u8(inptr0);
+ uint8x16_t s0B = vld1q_u8(inptr1);
+ uint8x16_t s0C = vld1q_u8(inptr2);
+ /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
+ * denote low half and high half respectively.
+ */
+ uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
+ vget_low_u8(s0B), three_u8);
+ uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
+ vget_high_u8(s0B), three_u8);
+ uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
+ vget_low_u8(s0B), three_u8);
+ uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
+ vget_high_u8(s0B), three_u8);
+ /* Load and compute s1colsum0 and s1colsum1. */
+ uint8x16_t s1A = vld1q_u8(inptr0 + 1);
+ uint8x16_t s1B = vld1q_u8(inptr1 + 1);
+ uint8x16_t s1C = vld1q_u8(inptr2 + 1);
+ uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
+ vget_low_u8(s1B), three_u8);
+ uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
+ vget_high_u8(s1B), three_u8);
+ uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
+ vget_low_u8(s1B), three_u8);
+ uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
+ vget_high_u8(s1B), three_u8);
+
+ /* Step 2: Blend the already-blended columns. */
+
+ uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+ uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+ uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+ uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+ uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+ uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+ uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+ uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+ /* Add ordered dithering bias to odd pixel values. */
+ output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+ output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+ output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+ output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+ /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+ uint8x16x2_t output_pixels0 = { {
+ vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
+ vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
+ } };
+ uint8x16x2_t output_pixels1 = { {
+ vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
+ vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
+ } };
+
+ /* Store pixel component values to memory.
+ * The minimum size of the output buffer for each row is 64 bytes => no
+ * need to worry about buffer overflow here. See "Creation of 2-D sample
+ * arrays" in jmemmgr.c for more details.
+ */
+ vst2q_u8(outptr0 + 1, output_pixels0);
+ vst2q_u8(outptr1 + 1, output_pixels1);
+
+ /* The first pixel of the image shifted our loads and stores by one byte.
+ * We have to re-align on a 32-byte boundary at some point before the end
+ * of the row (we do it now on the 32/33 pixel boundary) to stay within the
+ * bounds of the sample buffers without having to resort to a slow scalar
+ * tail case for the last (downsampled_width % 16) samples. See "Creation
+ * of 2-D sample arrays" in jmemmgr.c for more details.
+ */
+ for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+ /* Step 1: Blend samples vertically in columns s0 and s1. */
+
+ /* Load and compute s0colsum0 and s0colsum1. */
+ s0A = vld1q_u8(inptr0 + colctr - 1);
+ s0B = vld1q_u8(inptr1 + colctr - 1);
+ s0C = vld1q_u8(inptr2 + colctr - 1);
+ s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
+ three_u8);
+ s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
+ three_u8);
+ s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
+ three_u8);
+ s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
+ three_u8);
+ /* Load and compute s1colsum0 and s1colsum1. */
+ s1A = vld1q_u8(inptr0 + colctr);
+ s1B = vld1q_u8(inptr1 + colctr);
+ s1C = vld1q_u8(inptr2 + colctr);
+ s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
+ three_u8);
+ s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
+ three_u8);
+ s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
+ three_u8);
+ s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
+ three_u8);
+
+ /* Step 2: Blend the already-blended columns. */
+
+ output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+ output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+ output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+ output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+ output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+ output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+ output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+ output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+ /* Add ordered dithering bias to odd pixel values. */
+ output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+ output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+ output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+ output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+ /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+ output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
+ vshrn_n_u16(output0_p1_h, 4));
+ output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
+ vrshrn_n_u16(output0_p2_h, 4));
+ output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
+ vshrn_n_u16(output1_p1_h, 4));
+ output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
+ vrshrn_n_u16(output1_p2_h, 4));
+ /* Store pixel component values to memory. */
+ vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
+ vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
+ }
+
+ /* Last pixel component value in this row of the original image */
+ int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+ GETJSAMPLE(inptr0[downsampled_width - 1]);
+ outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
+ int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+ GETJSAMPLE(inptr2[downsampled_width - 1]);
+ outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
+ inrow++;
+ }
+}
+
+
+/* The diagram below shows a column of samples produced by h1v2 downsampling
+ * (or by losslessly rotating or transposing an h2v1-downsampled image.)
+ *
+ * +---------+
+ * | p0 |
+ * sA | |
+ * | p1 |
+ * +---------+
+ * | p2 |
+ * sB | |
+ * | p3 |
+ * +---------+
+ * | p4 |
+ * sC | |
+ * | p5 |
+ * +---------+
+ *
+ * Samples sA-sC were created by averaging the original pixel component values
+ * centered at positions p0-p5 above. To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each
+ * column.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1. For example:
+ * p1(upsampled) = 3/4 * sA + 1/4 * sB
+ * p2(upsampled) = 3/4 * sB + 1/4 * sA
+ * When computing the first and last pixel component values in the column,
+ * there is no adjacent sample to blend, so:
+ * p0(upsampled) = sA
+ * p5(upsampled) = sC
+ */
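
In scalar terms, one column position of the two output rows works out as below; the upper row receives the +1 dithering bias with a truncating shift and the lower row is rounded, matching the vector code that follows (illustrative helper):

static void h1v2_fancy_column(int sA, int sB, int sC,
                              unsigned char *out_upper,
                              unsigned char *out_lower)
{
  *out_upper = (unsigned char)((3 * sB + sA + 1) >> 2);  /* +1 bias, truncate */
  *out_lower = (unsigned char)((3 * sB + sC + 2) >> 2);  /* round to nearest */
}
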
+
+void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+ int inrow, outrow;
+ unsigned colctr;
+ /* Set up constants. */
+ const uint16x8_t one_u16 = vdupq_n_u16(1);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+
+ inrow = outrow = 0;
+ while (outrow < max_v_samp_factor) {
+ inptr0 = input_data[inrow - 1];
+ inptr1 = input_data[inrow];
+ inptr2 = input_data[inrow + 1];
+ /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+ * respectively.
+ */
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+ inrow++;
+
+ /* The size of the input and output buffers is always a multiple of 32
+ * bytes => no need to worry about buffer overflow when reading/writing
+ * memory. See "Creation of 2-D sample arrays" in jmemmgr.c for more
+ * details.
+ */
+ for (colctr = 0; colctr < downsampled_width; colctr += 16) {
+ /* Load samples. */
+ uint8x16_t sA = vld1q_u8(inptr0 + colctr);
+ uint8x16_t sB = vld1q_u8(inptr1 + colctr);
+ uint8x16_t sC = vld1q_u8(inptr2 + colctr);
+ /* Blend samples vertically. */
+ uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
+ vget_low_u8(sB), three_u8);
+ uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
+ vget_high_u8(sB), three_u8);
+ uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
+ vget_low_u8(sB), three_u8);
+ uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
+ vget_high_u8(sB), three_u8);
+ /* Add ordered dithering bias to pixel values in even output rows. */
+ colsum0_l = vaddq_u16(colsum0_l, one_u16);
+ colsum0_h = vaddq_u16(colsum0_h, one_u16);
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+ uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
+ vshrn_n_u16(colsum0_h, 2));
+ uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
+ vrshrn_n_u16(colsum1_h, 2));
+ /* Store pixel component values to memory. */
+ vst1q_u8(outptr0 + colctr, output_pixels0);
+ vst1q_u8(outptr1 + colctr, output_pixels1);
+ }
+ }
+}
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ * s0 s1
+ * +---------+---------+
+ * | | |
+ * | p0 p1 | p2 p3 |
+ * | | |
+ * +---------+---------+
+ *
+ * Samples s0 and s1 were created by averaging the original pixel component
+ * values centered at positions p0-p3 above. To approximate those original
+ * pixel component values, we duplicate the samples horizontally:
+ * p0(upsampled) = p1(upsampled) = s0
+ * p2(upsampled) = p3(upsampled) = s1
+ */
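
The scalar equivalent of this replication is a one-line loop (illustrative helper); the vector code below achieves the same effect by storing the same 16-byte vector through both fields of a vst2q_u8() interleaving store:

static void h2v1_upsample_row(const unsigned char *in, unsigned char *out,
                              unsigned downsampled_width)
{
  unsigned i;

  for (i = 0; i < downsampled_width; i++)
    out[2 * i] = out[2 * i + 1] = in[i];   /* duplicate each sample */
}
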
+
+void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr;
+ int inrow;
+ unsigned colctr;
+
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow];
+ for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+ uint8x16_t samples = vld1q_u8(inptr + colctr);
+ /* Duplicate the samples. The store operation below interleaves them so
+ * that adjacent pixel component values take on the same sample value,
+ * per above.
+ */
+ uint8x16x2_t output_pixels = { { samples, samples } };
+ /* Store pixel component values to memory.
+ * Due to the way sample buffers are allocated, we don't need to worry
+ * about tail cases when output_width is not a multiple of 32. See
+ * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+ */
+ vst2q_u8(outptr + 2 * colctr, output_pixels);
+ }
+ }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ * s0 s1
+ * +---------+---------+
+ * | p0 p1 | p2 p3 |
+ * sA | | |
+ * | p4 p5 | p6 p7 |
+ * +---------+---------+
+ * | p8 p9 | p10 p11|
+ * sB | | |
+ * | p12 p13| p14 p15|
+ * +---------+---------+
+ *
+ * Samples s0A-s1B were created by averaging the original pixel component
+ * values centered at positions p0-p15 above. To approximate those original
+ * pixel component values, we duplicate the samples both horizontally and
+ * vertically:
+ * p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
+ * p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
+ * p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
+ * p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
+ */
+
+void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr0, outptr1;
+ int inrow, outrow;
+ unsigned colctr;
+
+ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+
+ for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+ uint8x16_t samples = vld1q_u8(inptr + colctr);
+ /* Duplicate the samples. The store operation below interleaves them so
+ * that adjacent pixel component values take on the same sample value,
+ * per above.
+ */
+ uint8x16x2_t output_pixels = { { samples, samples } };
+ /* Store pixel component values for both output rows to memory.
+ * Due to the way sample buffers are allocated, we don't need to worry
+ * about tail cases when output_width is not a multiple of 32. See
+ * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+ */
+ vst2q_u8(outptr0 + 2 * colctr, output_pixels);
+ vst2q_u8(outptr1 + 2 * colctr, output_pixels);
+ }
+ }
+}
diff --git a/media/libjpeg/simd/arm/jfdctfst-neon.c b/media/libjpeg/simd/arm/jfdctfst-neon.c
new file mode 100644
index 0000000000..bb371be399
--- /dev/null
+++ b/media/libjpeg/simd/arm/jfdctfst-neon.c
@@ -0,0 +1,214 @@
+/*
+ * jfdctfst-neon.c - fast integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples. It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.382683433 = 12544 * 2^-15
+ * 0.541196100 = 17792 * 2^-15
+ * 0.707106781 = 23168 * 2^-15
+ * 0.306562965 = 9984 * 2^-15
+ *
+ * See jfdctfst.c for further details of the DCT algorithm. Where possible,
+ * the variable names and comments here in jsimd_fdct_ifast_neon() match up
+ * with those in jpeg_fdct_ifast().
+ */
+
+#define F_0_382 12544
+#define F_0_541 17792
+#define F_0_707 23168
+#define F_0_306 9984
+
+
+ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
+ F_0_382, F_0_541, F_0_707, F_0_306
+};
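
These constants are applied with vqdmulhq_lane_s16(), which computes saturate((2 * a * b) >> 16), i.e. a Q15 multiply-high without rounding (unlike the vqrdmulhq variant used by the color-conversion code). A scalar model of one element, with saturation omitted and an arithmetic right shift assumed (illustrative helper):

/* z = x * (c / 32768); e.g. c = 23168 gives z ~= 0.707107 * x. */
static short q15_mulh(short x, short c)
{
  return (short)((2 * (int)x * c) >> 16);   /* equivalent to (x * c) >> 15 */
}
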
+
+void jsimd_fdct_ifast_neon(DCTELEM *data)
+{
+ /* Load an 8x8 block of samples into Neon registers. De-interleaving loads
+ * are used, followed by vuzp to transpose the block such that we have a
+ * column of samples per vector - allowing all rows to be processed at once.
+ */
+ int16x8x4_t data1 = vld4q_s16(data);
+ int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
+
+ int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
+ int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
+ int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
+ int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
+
+ int16x8_t col0 = cols_04.val[0];
+ int16x8_t col1 = cols_15.val[0];
+ int16x8_t col2 = cols_26.val[0];
+ int16x8_t col3 = cols_37.val[0];
+ int16x8_t col4 = cols_04.val[1];
+ int16x8_t col5 = cols_15.val[1];
+ int16x8_t col6 = cols_26.val[1];
+ int16x8_t col7 = cols_37.val[1];
+
+ /* Pass 1: process rows. */
+
+ /* Load DCT conversion constants. */
+ const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
+
+ int16x8_t tmp0 = vaddq_s16(col0, col7);
+ int16x8_t tmp7 = vsubq_s16(col0, col7);
+ int16x8_t tmp1 = vaddq_s16(col1, col6);
+ int16x8_t tmp6 = vsubq_s16(col1, col6);
+ int16x8_t tmp2 = vaddq_s16(col2, col5);
+ int16x8_t tmp5 = vsubq_s16(col2, col5);
+ int16x8_t tmp3 = vaddq_s16(col3, col4);
+ int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+ /* Even part */
+ int16x8_t tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
+ int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+ int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+ int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+ col0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
+ col4 = vsubq_s16(tmp10, tmp11);
+
+ int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+ col2 = vaddq_s16(tmp13, z1); /* phase 5 */
+ col6 = vsubq_s16(tmp13, z1);
+
+ /* Odd part */
+ tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
+ tmp11 = vaddq_s16(tmp5, tmp6);
+ tmp12 = vaddq_s16(tmp6, tmp7);
+
+ int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+ int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+ z2 = vaddq_s16(z2, z5);
+ int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+ z5 = vaddq_s16(tmp12, z5);
+ z4 = vaddq_s16(z4, z5);
+ int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+ int16x8_t z11 = vaddq_s16(tmp7, z3); /* phase 5 */
+ int16x8_t z13 = vsubq_s16(tmp7, z3);
+
+ col5 = vaddq_s16(z13, z2); /* phase 6 */
+ col3 = vsubq_s16(z13, z2);
+ col1 = vaddq_s16(z11, z4);
+ col7 = vsubq_s16(z11, z4);
+
+ /* Transpose to work on columns in pass 2. */
+ int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+ int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+ int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+ int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+ int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+ vreinterpretq_s32_s16(cols_45.val[0]));
+ int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+ vreinterpretq_s32_s16(cols_45.val[1]));
+ int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+ vreinterpretq_s32_s16(cols_67.val[0]));
+ int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+ vreinterpretq_s32_s16(cols_67.val[1]));
+
+ int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+ int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+ int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+ int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+ int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+ int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+ int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+ int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+ int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+ int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+ int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+ int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+ /* Pass 2: process columns. */
+
+ tmp0 = vaddq_s16(row0, row7);
+ tmp7 = vsubq_s16(row0, row7);
+ tmp1 = vaddq_s16(row1, row6);
+ tmp6 = vsubq_s16(row1, row6);
+ tmp2 = vaddq_s16(row2, row5);
+ tmp5 = vsubq_s16(row2, row5);
+ tmp3 = vaddq_s16(row3, row4);
+ tmp4 = vsubq_s16(row3, row4);
+
+ /* Even part */
+ tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
+ tmp13 = vsubq_s16(tmp0, tmp3);
+ tmp11 = vaddq_s16(tmp1, tmp2);
+ tmp12 = vsubq_s16(tmp1, tmp2);
+
+ row0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
+ row4 = vsubq_s16(tmp10, tmp11);
+
+ z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+ row2 = vaddq_s16(tmp13, z1); /* phase 5 */
+ row6 = vsubq_s16(tmp13, z1);
+
+ /* Odd part */
+ tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
+ tmp11 = vaddq_s16(tmp5, tmp6);
+ tmp12 = vaddq_s16(tmp6, tmp7);
+
+ z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+ z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+ z2 = vaddq_s16(z2, z5);
+ z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+ z5 = vaddq_s16(tmp12, z5);
+ z4 = vaddq_s16(z4, z5);
+ z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+ z11 = vaddq_s16(tmp7, z3); /* phase 5 */
+ z13 = vsubq_s16(tmp7, z3);
+
+ row5 = vaddq_s16(z13, z2); /* phase 6 */
+ row3 = vsubq_s16(z13, z2);
+ row1 = vaddq_s16(z11, z4);
+ row7 = vsubq_s16(z11, z4);
+
+ vst1q_s16(data + 0 * DCTSIZE, row0);
+ vst1q_s16(data + 1 * DCTSIZE, row1);
+ vst1q_s16(data + 2 * DCTSIZE, row2);
+ vst1q_s16(data + 3 * DCTSIZE, row3);
+ vst1q_s16(data + 4 * DCTSIZE, row4);
+ vst1q_s16(data + 5 * DCTSIZE, row5);
+ vst1q_s16(data + 6 * DCTSIZE, row6);
+ vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/media/libjpeg/simd/arm/jfdctint-neon.c b/media/libjpeg/simd/arm/jfdctint-neon.c
new file mode 100644
index 0000000000..ccfc07b15d
--- /dev/null
+++ b/media/libjpeg/simd/arm/jfdctint-neon.c
@@ -0,0 +1,376 @@
+/*
+ * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples. It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_islow() function, which can be found in jfdctint.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.298631336 = 2446 * 2^-13
+ * 0.390180644 = 3196 * 2^-13
+ * 0.541196100 = 4433 * 2^-13
+ * 0.765366865 = 6270 * 2^-13
+ * 0.899976223 = 7373 * 2^-13
+ * 1.175875602 = 9633 * 2^-13
+ * 1.501321110 = 12299 * 2^-13
+ * 1.847759065 = 15137 * 2^-13
+ * 1.961570560 = 16069 * 2^-13
+ * 2.053119869 = 16819 * 2^-13
+ * 2.562915447 = 20995 * 2^-13
+ * 3.072711026 = 25172 * 2^-13
+ *
+ * See jfdctint.c for further details of the DCT algorithm. Where possible,
+ * the variable names and comments here in jsimd_fdct_islow_neon() match up
+ * with those in jpeg_fdct_islow().
+ */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+#define F_0_298 2446
+#define F_0_390 3196
+#define F_0_541 4433
+#define F_0_765 6270
+#define F_0_899 7373
+#define F_1_175 9633
+#define F_1_501 12299
+#define F_1_847 15137
+#define F_1_961 16069
+#define F_2_053 16819
+#define F_2_562 20995
+#define F_3_072 25172
+
+
+ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
+ F_0_298, -F_0_390, F_0_541, F_0_765,
+ -F_0_899, F_1_175, F_1_501, -F_1_847,
+ -F_1_961, F_2_053, -F_2_562, F_3_072
+};
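+
+/* All products below are formed at 32-bit precision with vmull_lane_s16() /
+ * vmlal_lane_s16() against the Q13 constants above (e.g. F_0_541 = 4433
+ * ~= 0.541196 * 2^13) and then reduced with vrshrn_n_s32(), a rounding,
+ * narrowing right shift by DESCALE_P1 or DESCALE_P2 bits.  Pass 1 descales by
+ * CONST_BITS - PASS1_BITS, deliberately leaving the intermediate results
+ * scaled up by 2^PASS1_BITS for extra precision; pass 2 descales by
+ * CONST_BITS + PASS1_BITS to remove that factor again, as in jfdctint.c.
+ */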
+
+void jsimd_fdct_islow_neon(DCTELEM *data)
+{
+ /* Load DCT constants. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
+#else
+ /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+ const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
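+  /* Either path yields the same three vectors: vld1_s16_x3() is a contiguous
+   * (non-interleaving) load of 12 consecutive int16_t values into three
+   * 64-bit registers.
+   */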
+
+ /* Load an 8x8 block of samples into Neon registers. De-interleaving loads
+ * are used, followed by vuzp to transpose the block such that we have a
+ * column of samples per vector - allowing all rows to be processed at once.
+ */
+ int16x8x4_t s_rows_0123 = vld4q_s16(data);
+ int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
+
+ int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
+ int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
+ int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
+ int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
+
+ int16x8_t col0 = cols_04.val[0];
+ int16x8_t col1 = cols_15.val[0];
+ int16x8_t col2 = cols_26.val[0];
+ int16x8_t col3 = cols_37.val[0];
+ int16x8_t col4 = cols_04.val[1];
+ int16x8_t col5 = cols_15.val[1];
+ int16x8_t col6 = cols_26.val[1];
+ int16x8_t col7 = cols_37.val[1];
+
+ /* Pass 1: process rows. */
+
+ int16x8_t tmp0 = vaddq_s16(col0, col7);
+ int16x8_t tmp7 = vsubq_s16(col0, col7);
+ int16x8_t tmp1 = vaddq_s16(col1, col6);
+ int16x8_t tmp6 = vsubq_s16(col1, col6);
+ int16x8_t tmp2 = vaddq_s16(col2, col5);
+ int16x8_t tmp5 = vsubq_s16(col2, col5);
+ int16x8_t tmp3 = vaddq_s16(col3, col4);
+ int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+ /* Even part */
+ int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
+ int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+ int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+ int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+ col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+ col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+ int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+ int32x4_t z1_l =
+ vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+ int32x4_t z1_h =
+ vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+ int32x4_t col2_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+ int32x4_t col2_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+ col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
+ vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
+
+ int32x4_t col6_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+ int32x4_t col6_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+ col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
+ vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
+
+ /* Odd part */
+ int16x8_t z1 = vaddq_s16(tmp4, tmp7);
+ int16x8_t z2 = vaddq_s16(tmp5, tmp6);
+ int16x8_t z3 = vaddq_s16(tmp4, tmp6);
+ int16x8_t z4 = vaddq_s16(tmp5, tmp7);
+ /* sqrt(2) * c3 */
+ int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+ int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+ z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+ z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+ /* sqrt(2) * (-c1+c3+c5-c7) */
+ int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+ int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+ /* sqrt(2) * ( c1+c3-c5+c7) */
+ int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+ int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+ /* sqrt(2) * ( c1+c3+c5-c7) */
+ int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+ int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+ /* sqrt(2) * ( c1+c3-c5-c7) */
+ int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+ int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+ /* sqrt(2) * (c7-c3) */
+ z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+ z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+ /* sqrt(2) * (-c1-c3) */
+ int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+ int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+ /* sqrt(2) * (-c3-c5) */
+ int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+ int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+ /* sqrt(2) * (c5-c3) */
+ int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+ int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+ z3_l = vaddq_s32(z3_l, z5_l);
+ z3_h = vaddq_s32(z3_h, z5_h);
+ z4_l = vaddq_s32(z4_l, z5_l);
+ z4_h = vaddq_s32(z4_h, z5_h);
+
+ tmp4_l = vaddq_s32(tmp4_l, z1_l);
+ tmp4_h = vaddq_s32(tmp4_h, z1_h);
+ tmp4_l = vaddq_s32(tmp4_l, z3_l);
+ tmp4_h = vaddq_s32(tmp4_h, z3_h);
+ col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
+ vrshrn_n_s32(tmp4_h, DESCALE_P1));
+
+ tmp5_l = vaddq_s32(tmp5_l, z2_l);
+ tmp5_h = vaddq_s32(tmp5_h, z2_h);
+ tmp5_l = vaddq_s32(tmp5_l, z4_l);
+ tmp5_h = vaddq_s32(tmp5_h, z4_h);
+ col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
+ vrshrn_n_s32(tmp5_h, DESCALE_P1));
+
+ tmp6_l = vaddq_s32(tmp6_l, z2_l);
+ tmp6_h = vaddq_s32(tmp6_h, z2_h);
+ tmp6_l = vaddq_s32(tmp6_l, z3_l);
+ tmp6_h = vaddq_s32(tmp6_h, z3_h);
+ col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
+ vrshrn_n_s32(tmp6_h, DESCALE_P1));
+
+ tmp7_l = vaddq_s32(tmp7_l, z1_l);
+ tmp7_h = vaddq_s32(tmp7_h, z1_h);
+ tmp7_l = vaddq_s32(tmp7_l, z4_l);
+ tmp7_h = vaddq_s32(tmp7_h, z4_h);
+ col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
+ vrshrn_n_s32(tmp7_h, DESCALE_P1));
+
+ /* Transpose to work on columns in pass 2. */
+ int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+ int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+ int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+ int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+ int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+ vreinterpretq_s32_s16(cols_45.val[0]));
+ int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+ vreinterpretq_s32_s16(cols_45.val[1]));
+ int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+ vreinterpretq_s32_s16(cols_67.val[0]));
+ int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+ vreinterpretq_s32_s16(cols_67.val[1]));
+
+ int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+ int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+ int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+ int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+ int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+ int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+ int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+ int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+ int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+ int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+ int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+ int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+ /* Pass 2: process columns. */
+
+ tmp0 = vaddq_s16(row0, row7);
+ tmp7 = vsubq_s16(row0, row7);
+ tmp1 = vaddq_s16(row1, row6);
+ tmp6 = vsubq_s16(row1, row6);
+ tmp2 = vaddq_s16(row2, row5);
+ tmp5 = vsubq_s16(row2, row5);
+ tmp3 = vaddq_s16(row3, row4);
+ tmp4 = vsubq_s16(row3, row4);
+
+ /* Even part */
+ tmp10 = vaddq_s16(tmp0, tmp3);
+ tmp13 = vsubq_s16(tmp0, tmp3);
+ tmp11 = vaddq_s16(tmp1, tmp2);
+ tmp12 = vsubq_s16(tmp1, tmp2);
+
+ row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+ row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+ tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+ z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+ z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+ int32x4_t row2_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+ int32x4_t row2_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+ row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
+ vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
+
+ int32x4_t row6_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+ int32x4_t row6_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+ row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
+ vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
+
+ /* Odd part */
+ z1 = vaddq_s16(tmp4, tmp7);
+ z2 = vaddq_s16(tmp5, tmp6);
+ z3 = vaddq_s16(tmp4, tmp6);
+ z4 = vaddq_s16(tmp5, tmp7);
+ /* sqrt(2) * c3 */
+ z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+ z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+ z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+ z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+ /* sqrt(2) * (-c1+c3+c5-c7) */
+ tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+ tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+ /* sqrt(2) * ( c1+c3-c5+c7) */
+ tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+ tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+ /* sqrt(2) * ( c1+c3+c5-c7) */
+ tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+ tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+ /* sqrt(2) * ( c1+c3-c5-c7) */
+ tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+ tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+ /* sqrt(2) * (c7-c3) */
+ z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+ z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+ /* sqrt(2) * (-c1-c3) */
+ z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+ z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+ /* sqrt(2) * (-c3-c5) */
+ z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+ z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+ /* sqrt(2) * (c5-c3) */
+ z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+ z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+ z3_l = vaddq_s32(z3_l, z5_l);
+ z3_h = vaddq_s32(z3_h, z5_h);
+ z4_l = vaddq_s32(z4_l, z5_l);
+ z4_h = vaddq_s32(z4_h, z5_h);
+
+ tmp4_l = vaddq_s32(tmp4_l, z1_l);
+ tmp4_h = vaddq_s32(tmp4_h, z1_h);
+ tmp4_l = vaddq_s32(tmp4_l, z3_l);
+ tmp4_h = vaddq_s32(tmp4_h, z3_h);
+ row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
+ vrshrn_n_s32(tmp4_h, DESCALE_P2));
+
+ tmp5_l = vaddq_s32(tmp5_l, z2_l);
+ tmp5_h = vaddq_s32(tmp5_h, z2_h);
+ tmp5_l = vaddq_s32(tmp5_l, z4_l);
+ tmp5_h = vaddq_s32(tmp5_h, z4_h);
+ row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
+ vrshrn_n_s32(tmp5_h, DESCALE_P2));
+
+ tmp6_l = vaddq_s32(tmp6_l, z2_l);
+ tmp6_h = vaddq_s32(tmp6_h, z2_h);
+ tmp6_l = vaddq_s32(tmp6_l, z3_l);
+ tmp6_h = vaddq_s32(tmp6_h, z3_h);
+ row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
+ vrshrn_n_s32(tmp6_h, DESCALE_P2));
+
+ tmp7_l = vaddq_s32(tmp7_l, z1_l);
+ tmp7_h = vaddq_s32(tmp7_h, z1_h);
+ tmp7_l = vaddq_s32(tmp7_l, z4_l);
+ tmp7_h = vaddq_s32(tmp7_h, z4_h);
+ row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
+ vrshrn_n_s32(tmp7_h, DESCALE_P2));
+
+ vst1q_s16(data + 0 * DCTSIZE, row0);
+ vst1q_s16(data + 1 * DCTSIZE, row1);
+ vst1q_s16(data + 2 * DCTSIZE, row2);
+ vst1q_s16(data + 3 * DCTSIZE, row3);
+ vst1q_s16(data + 4 * DCTSIZE, row4);
+ vst1q_s16(data + 5 * DCTSIZE, row5);
+ vst1q_s16(data + 6 * DCTSIZE, row6);
+ vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/media/libjpeg/simd/arm/jidctfst-neon.c b/media/libjpeg/simd/arm/jidctfst-neon.c
new file mode 100644
index 0000000000..a91be5362e
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctfst-neon.c
@@ -0,0 +1,472 @@
+/*
+ * jidctfst-neon.c - fast integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
+ * inverse DCT (Discrete Cosine Transform) on one block of coefficients. It
+ * uses the same calculations and produces exactly the same output as IJG's
+ * original jpeg_idct_ifast() function, which can be found in jidctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.082392200 = 2688 * 2^-15
+ * 0.414213562 = 13568 * 2^-15
+ * 0.847759065 = 27776 * 2^-15
+ * 0.613125930 = 20096 * 2^-15
+ *
+ * See jidctfst.c for further details of the IDCT algorithm. Where possible,
+ * the variable names and comments here in jsimd_idct_ifast_neon() match up
+ * with those in jpeg_idct_ifast().
+ */
+
+#define PASS1_BITS 2
+
+#define F_0_082 2688
+#define F_0_414 13568
+#define F_0_847 27776
+#define F_0_613 20096
+
+
+ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = {
+ F_0_082, F_0_414, F_0_847, F_0_613
+};
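+
+/* A Q15 constant cannot represent a factor >= 1.0, so the multiplications by
+ * 1.414213562, 1.847759065, 1.082392200, and 2.613125930 from jidctfst.c are
+ * evaluated below as x + vqdmulh(x, F_0_414), x + vqdmulh(x, F_0_847),
+ * x + vqdmulh(x, F_0_082), and 2 * x + vqdmulh(x, F_0_613) respectively,
+ * where vqdmulh(x, c) ~= (x * c) >> 15.
+ */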
+
+void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ IFAST_MULT_TYPE *quantptr = dct_table;
+
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+ int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+ int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table values for DC coefficients. */
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ /* Dequantize DC coefficients. */
+ row0 = vmulq_s16(row0, quant_row0);
+
+ /* Construct bitmap to test if all AC coefficients are 0. */
+ int16x8_t bitmap = vorrq_s16(row1, row2);
+ bitmap = vorrq_s16(bitmap, row3);
+ bitmap = vorrq_s16(bitmap, row4);
+ bitmap = vorrq_s16(bitmap, row5);
+ bitmap = vorrq_s16(bitmap, row6);
+ bitmap = vorrq_s16(bitmap, row7);
+
+ int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+ int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
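+  /* The low 64 bits of the OR-ed vector cover columns 0-3 and the high 64
+   * bits cover columns 4-7, so each half indicates whether the corresponding
+   * four columns contain any non-zero AC coefficient.
+   */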
+
+ /* Load IDCT conversion constants. */
+ const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);
+
+ if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+ /* All AC coefficients are zero.
+ * Compute DC values and duplicate into vectors.
+ */
+ int16x8_t dcval = row0;
+ row1 = dcval;
+ row2 = dcval;
+ row3 = dcval;
+ row4 = dcval;
+ row5 = dcval;
+ row6 = dcval;
+ row7 = dcval;
+ } else if (left_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 0, 1, 2, and 3.
+ * Use DC values for these columns.
+ */
+ int16x4_t dcval = vget_low_s16(row0);
+
+ /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */
+
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+ int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+ /* Even part: dequantize DCT coefficients. */
+ int16x4_t tmp0 = vget_high_s16(row0);
+ int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
+ int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
+ int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+ int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
+ int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+ int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
+ int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+ int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+ tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+ tmp12 = vsub_s16(tmp12, tmp13);
+
+ tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
+ tmp3 = vsub_s16(tmp10, tmp13);
+ tmp1 = vadd_s16(tmp11, tmp12);
+ tmp2 = vsub_s16(tmp11, tmp12);
+
+ /* Odd part: dequantize DCT coefficients. */
+ int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
+ int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
+ int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
+ int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);
+
+ int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
+ int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+ int16x4_t z11 = vadd_s16(tmp4, tmp7);
+ int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+ tmp7 = vadd_s16(z11, z13); /* phase 5 */
+ int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+ tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+ tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+ int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+ int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+ z5 = vadd_s16(z5, z10_add_z12);
+ tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+ tmp10 = vadd_s16(tmp10, z12);
+ tmp10 = vsub_s16(tmp10, z5);
+ tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+ tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+ tmp12 = vadd_s16(tmp12, z5);
+
+ tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
+ tmp5 = vsub_s16(tmp11, tmp6);
+ tmp4 = vadd_s16(tmp10, tmp5);
+
+ row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
+ row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
+ row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
+ row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
+ row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
+ row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
+ row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
+ row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
+ } else if (right_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 4, 5, 6, and 7.
+ * Use DC values for these columns.
+ */
+ int16x4_t dcval = vget_high_s16(row0);
+
+ /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */
+
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+ int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part: dequantize DCT coefficients. */
+ int16x4_t tmp0 = vget_low_s16(row0);
+ int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
+ int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
+ int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+ int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
+ int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+ int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
+ int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+ int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+ tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+ tmp12 = vsub_s16(tmp12, tmp13);
+
+ tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
+ tmp3 = vsub_s16(tmp10, tmp13);
+ tmp1 = vadd_s16(tmp11, tmp12);
+ tmp2 = vsub_s16(tmp11, tmp12);
+
+ /* Odd part: dequantize DCT coefficients. */
+ int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
+ int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
+ int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
+ int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);
+
+ int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
+ int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+ int16x4_t z11 = vadd_s16(tmp4, tmp7);
+ int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+ tmp7 = vadd_s16(z11, z13); /* phase 5 */
+ int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+ tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+ tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+ int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+ int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+ z5 = vadd_s16(z5, z10_add_z12);
+ tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+ tmp10 = vadd_s16(tmp10, z12);
+ tmp10 = vsub_s16(tmp10, z5);
+ tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+ tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+ tmp12 = vadd_s16(tmp12, z5);
+
+ tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
+ tmp5 = vsub_s16(tmp11, tmp6);
+ tmp4 = vadd_s16(tmp10, tmp5);
+
+ row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
+ row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
+ row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
+ row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
+ row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
+ row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
+ row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
+ row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
+ } else {
+ /* Some AC coefficients are non-zero; full IDCT calculation required. */
+
+ /* Load quantization table. */
+ int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+ int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+ int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+ int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
+ int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+ int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+ int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part: dequantize DCT coefficients. */
+ int16x8_t tmp0 = row0;
+ int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
+ int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
+ int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
+
+ int16x8_t tmp10 = vaddq_s16(tmp0, tmp2); /* phase 3 */
+ int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
+
+ int16x8_t tmp13 = vaddq_s16(tmp1, tmp3); /* phases 5-3 */
+ int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
+ int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
+ tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
+ tmp12 = vsubq_s16(tmp12, tmp13);
+
+ tmp0 = vaddq_s16(tmp10, tmp13); /* phase 2 */
+ tmp3 = vsubq_s16(tmp10, tmp13);
+ tmp1 = vaddq_s16(tmp11, tmp12);
+ tmp2 = vsubq_s16(tmp11, tmp12);
+
+ /* Odd part: dequantize DCT coefficients. */
+ int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
+ int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
+ int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
+ int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
+
+ int16x8_t z13 = vaddq_s16(tmp6, tmp5); /* phase 6 */
+ int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
+ int16x8_t z11 = vaddq_s16(tmp4, tmp7);
+ int16x8_t z12 = vsubq_s16(tmp4, tmp7);
+
+ tmp7 = vaddq_s16(z11, z13); /* phase 5 */
+ int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+ tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+ tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+ int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+ int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+ z5 = vaddq_s16(z5, z10_add_z12);
+ tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+ tmp10 = vaddq_s16(tmp10, z12);
+ tmp10 = vsubq_s16(tmp10, z5);
+ tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+ tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+ tmp12 = vaddq_s16(tmp12, z5);
+
+ tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
+ tmp5 = vsubq_s16(tmp11, tmp6);
+ tmp4 = vaddq_s16(tmp10, tmp5);
+
+ row0 = vaddq_s16(tmp0, tmp7);
+ row7 = vsubq_s16(tmp0, tmp7);
+ row1 = vaddq_s16(tmp1, tmp6);
+ row6 = vsubq_s16(tmp1, tmp6);
+ row2 = vaddq_s16(tmp2, tmp5);
+ row5 = vsubq_s16(tmp2, tmp5);
+ row4 = vaddq_s16(tmp3, tmp4);
+ row3 = vsubq_s16(tmp3, tmp4);
+ }
+
+ /* Transpose rows to work on columns in pass 2. */
+ int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
+ int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
+ int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
+ int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
+
+ int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
+ vreinterpretq_s32_s16(rows_45.val[0]));
+ int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
+ vreinterpretq_s32_s16(rows_45.val[1]));
+ int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
+ vreinterpretq_s32_s16(rows_67.val[0]));
+ int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
+ vreinterpretq_s32_s16(rows_67.val[1]));
+
+ int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
+ int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
+ int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
+ int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
+
+ int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
+ int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
+ int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
+ int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
+ int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
+ int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
+ int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
+ int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
+
+ /* 1-D IDCT, pass 2 */
+
+ /* Even part */
+ int16x8_t tmp10 = vaddq_s16(col0, col4);
+ int16x8_t tmp11 = vsubq_s16(col0, col4);
+
+ int16x8_t tmp13 = vaddq_s16(col2, col6);
+ int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
+ int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
+ tmp12 = vaddq_s16(tmp12, col2_sub_col6);
+ tmp12 = vsubq_s16(tmp12, tmp13);
+
+ int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
+ int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
+ int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
+ int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
+
+ /* Odd part */
+ int16x8_t z13 = vaddq_s16(col5, col3);
+ int16x8_t neg_z10 = vsubq_s16(col3, col5);
+ int16x8_t z11 = vaddq_s16(col1, col7);
+ int16x8_t z12 = vsubq_s16(col1, col7);
+
+ int16x8_t tmp7 = vaddq_s16(z11, z13); /* phase 5 */
+ int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+ tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+ tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+ int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+ int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+ z5 = vaddq_s16(z5, z10_add_z12);
+ tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+ tmp10 = vaddq_s16(tmp10, z12);
+ tmp10 = vsubq_s16(tmp10, z5);
+ tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+ tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+ tmp12 = vaddq_s16(tmp12, z5);
+
+ int16x8_t tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
+ int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
+ int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);
+
+ col0 = vaddq_s16(tmp0, tmp7);
+ col7 = vsubq_s16(tmp0, tmp7);
+ col1 = vaddq_s16(tmp1, tmp6);
+ col6 = vsubq_s16(tmp1, tmp6);
+ col2 = vaddq_s16(tmp2, tmp5);
+ col5 = vsubq_s16(tmp2, tmp5);
+ col4 = vaddq_s16(tmp3, tmp4);
+ col3 = vsubq_s16(tmp3, tmp4);
+
+ /* Scale down by a factor of 8, narrowing to 8-bit. */
+ int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
+ vqshrn_n_s16(col1, PASS1_BITS + 3));
+ int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
+ vqshrn_n_s16(col5, PASS1_BITS + 3));
+ int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
+ vqshrn_n_s16(col3, PASS1_BITS + 3));
+ int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
+ vqshrn_n_s16(col7, PASS1_BITS + 3));
+ /* Clamp to range [0-255]. */
+ uint8x16_t cols_01 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_45 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_23 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_67 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
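+  /* vqshrn_n_s16() has already saturated each value to the int8_t range
+   * [-128, 127]; the wrapping 8-bit addition of CENTERJSAMPLE (128) then
+   * recenters that range to [0, 255], so no further clamping is required.
+   */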
+
+ /* Transpose block to prepare for store. */
+ uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
+ vreinterpretq_u32_u8(cols_45));
+ uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
+ vreinterpretq_u32_u8(cols_67));
+
+ uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
+ vreinterpretq_u8_u32(cols_0415.val[1]));
+ uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
+ vreinterpretq_u8_u32(cols_2637.val[1]));
+ uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
+ vreinterpretq_u16_u8(cols_2367.val[0]));
+ uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
+ vreinterpretq_u16_u8(cols_2367.val[1]));
+
+ uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
+ uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
+ uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
+ uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
+
+ JSAMPROW outptr0 = output_buf[0] + output_col;
+ JSAMPROW outptr1 = output_buf[1] + output_col;
+ JSAMPROW outptr2 = output_buf[2] + output_col;
+ JSAMPROW outptr3 = output_buf[3] + output_col;
+ JSAMPROW outptr4 = output_buf[4] + output_col;
+ JSAMPROW outptr5 = output_buf[5] + output_col;
+ JSAMPROW outptr6 = output_buf[6] + output_col;
+ JSAMPROW outptr7 = output_buf[7] + output_col;
+
+ /* Store DCT block to memory. */
+ vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
+ vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
+ vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
+ vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
+ vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
+ vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
+ vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
+ vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
+}
diff --git a/media/libjpeg/simd/arm/jidctint-neon.c b/media/libjpeg/simd/arm/jidctint-neon.c
new file mode 100644
index 0000000000..043b652e6c
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctint-neon.c
@@ -0,0 +1,802 @@
+/*
+ * jidctint-neon.c - accurate integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
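+
+/* Pass 1 descales by CONST_BITS - PASS1_BITS, leaving the intermediate
+ * results scaled up by 2^PASS1_BITS.  Pass 2 removes CONST_BITS, PASS1_BITS,
+ * and 3 further bits, the last accounting for the divide-by-8 that completes
+ * the 8x8 inverse transform, as in jpeg_idct_islow() in jidctint.c.
+ */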
+
+/* The computation of the inverse DCT requires the use of constants known at
+ * compile time. Scaled integer constants are used to avoid floating-point
+ * arithmetic:
+ * 0.298631336 = 2446 * 2^-13
+ * 0.390180644 = 3196 * 2^-13
+ * 0.541196100 = 4433 * 2^-13
+ * 0.765366865 = 6270 * 2^-13
+ * 0.899976223 = 7373 * 2^-13
+ * 1.175875602 = 9633 * 2^-13
+ * 1.501321110 = 12299 * 2^-13
+ * 1.847759065 = 15137 * 2^-13
+ * 1.961570560 = 16069 * 2^-13
+ * 2.053119869 = 16819 * 2^-13
+ * 2.562915447 = 20995 * 2^-13
+ * 3.072711026 = 25172 * 2^-13
+ */
+
+#define F_0_298 2446
+#define F_0_390 3196
+#define F_0_541 4433
+#define F_0_765 6270
+#define F_0_899 7373
+#define F_1_175 9633
+#define F_1_501 12299
+#define F_1_847 15137
+#define F_1_961 16069
+#define F_2_053 16819
+#define F_2_562 20995
+#define F_3_072 25172
+
+#define F_1_175_MINUS_1_961 (F_1_175 - F_1_961)
+#define F_1_175_MINUS_0_390 (F_1_175 - F_0_390)
+#define F_0_541_MINUS_1_847 (F_0_541 - F_1_847)
+#define F_3_072_MINUS_2_562 (F_3_072 - F_2_562)
+#define F_0_298_MINUS_0_899 (F_0_298 - F_0_899)
+#define F_1_501_MINUS_0_899 (F_1_501 - F_0_899)
+#define F_2_053_MINUS_2_562 (F_2_053 - F_2_562)
+#define F_0_541_PLUS_0_765 (F_0_541 + F_0_765)
+
+
+ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
+ F_0_899, F_0_541,
+ F_2_562, F_0_298_MINUS_0_899,
+ F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
+ F_0_541_PLUS_0_765, F_1_175,
+ F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
+ F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
+ 0, 0, 0, 0
+};
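+
+/* With the 12 constants loaded as three 4-lane vectors, the (vector, lane)
+ * references used below resolve as:
+ *   val[0] = { F_0_899, F_0_541, F_2_562, F_0_298_MINUS_0_899 }
+ *   val[1] = { F_1_501_MINUS_0_899, F_2_053_MINUS_2_562, F_0_541_PLUS_0_765,
+ *              F_1_175 }
+ *   val[2] = { F_1_175_MINUS_0_390, F_0_541_MINUS_1_847, F_3_072_MINUS_2_562,
+ *              F_1_175_MINUS_1_961 }
+ * The pre-combined sums and differences let each output term be formed with
+ * one vmull_lane_s16() plus one vmlal_lane_s16()/vmlsl_lane_s16() rather than
+ * computing the shared z1/z2/z5 products of jidctint.c separately.
+ */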
+
+
+/* Forward declaration of regular and sparse IDCT helper functions */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t row4,
+ int16x4_t row5,
+ int16x4_t row6,
+ int16x4_t row7,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16x4_t quant_row4,
+ int16x4_t quant_row5,
+ int16x4_t quant_row6,
+ int16x4_t quant_row7,
+ int16_t *workspace_1,
+ int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16_t *workspace_1,
+ int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset);
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset);
+
+
+/* Perform dequantization and inverse DCT on one block of coefficients. For
+ * reference, the C implementation (jpeg_idct_islow()) can be found in
+ * jidctint.c.
+ *
+ * Optimization techniques used for fast data access:
+ *
+ * In each pass, the inverse DCT is computed for the left and right 4x8 halves
+ * of the DCT block. This avoids spilling due to register pressure, and the
+ * increased granularity allows for an optimized calculation depending on the
+ * values of the DCT coefficients. Between passes, intermediate data is stored
+ * in 4x8 workspace buffers.
+ *
+ * Transposing the 8x8 DCT block after each pass can be achieved by transposing
+ * each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
+ * diagram below.) Swapping quadrants is cheap, since the second pass can just
+ * swap the workspace buffer pointers.
+ *
+ * +-------+-------+ +-------+-------+
+ * | | | | | |
+ * | 0 | 1 | | 0 | 2 |
+ * | | | transpose | | |
+ * +-------+-------+ ------> +-------+-------+
+ * | | | | | |
+ * | 2 | 3 | | 1 | 3 |
+ * | | | | | |
+ * +-------+-------+ +-------+-------+
+ *
+ * Optimization techniques used to accelerate the inverse DCT calculation:
+ *
+ * In a DCT coefficient block, the coefficients are increasingly likely to be 0
+ * as you move diagonally from top left to bottom right. If whole rows of
+ * coefficients are 0, then the inverse DCT calculation can be simplified. On
+ * the first pass of the inverse DCT, we test for three special cases before
+ * defaulting to a full "regular" inverse DCT:
+ *
+ * 1) Coefficients in rows 4-7 are all zero. In this case, we perform a
+ * "sparse" simplified inverse DCT on rows 0-3.
+ * 2) AC coefficients (rows 1-7) are all zero. In this case, the inverse DCT
+ * result is equal to the dequantized DC coefficients.
+ * 3) AC and DC coefficients are all zero. In this case, the inverse DCT
+ * result is all zero. For the left 4x8 half, this is handled identically
+ * to Case 2 above. For the right 4x8 half, we do no work and signal that
+ * the "sparse" algorithm is required for the second pass.
+ *
+ * In the second pass, only a single special case is tested: whether the AC and
+ * DC coefficients were all zero in the right 4x8 block during the first pass
+ * (refer to Case 3 above.) If this is the case, then a "sparse" variant of
+ * the second pass is performed for both the left and right halves of the DCT
+ * block. (The transposition after the first pass means that the right 4x8
+ * block during the first pass becomes rows 4-7 during the second pass.)
+ */
+
+void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ ISLOW_MULT_TYPE *quantptr = dct_table;
+
+ int16_t workspace_l[8 * DCTSIZE / 2];
+ int16_t workspace_r[8 * DCTSIZE / 2];
+
+ /* Compute IDCT first pass on left 4x8 coefficient block. */
+
+ /* Load DCT coefficients in left 4x8 block. */
+ int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
+ int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
+ int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE);
+ int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE);
+ int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE);
+ int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE);
+ int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE);
+ int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table for left 4x8 block. */
+ int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE);
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+ int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+ /* Construct bitmap to test if DCT coefficients in left 4x8 block are 0. */
+ int16x4_t bitmap = vorr_s16(row7, row6);
+ bitmap = vorr_s16(bitmap, row5);
+ bitmap = vorr_s16(bitmap, row4);
+ int64_t bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+ if (bitmap_rows_4567 == 0) {
+ bitmap = vorr_s16(bitmap, row3);
+ bitmap = vorr_s16(bitmap, row2);
+ bitmap = vorr_s16(bitmap, row1);
+ int64_t left_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+ if (left_ac_bitmap == 0) {
+ int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+ int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+ /* Store 4x4 blocks to workspace, transposing in the process. */
+ vst4_s16(workspace_l, quadrant);
+ vst4_s16(workspace_r, quadrant);
+ } else {
+ jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+ quant_row1, quant_row2, quant_row3,
+ workspace_l, workspace_r);
+ }
+ } else {
+ jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+ row6, row7, quant_row0, quant_row1,
+ quant_row2, quant_row3, quant_row4,
+ quant_row5, quant_row6, quant_row7,
+ workspace_l, workspace_r);
+ }
+
+ /* Compute IDCT first pass on right 4x8 coefficient block. */
+
+ /* Load DCT coefficients in right 4x8 block. */
+ row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
+ row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
+ row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
+ row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4);
+ row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4);
+ row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4);
+ row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4);
+ row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4);
+
+ /* Load quantization table for right 4x8 block. */
+ quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4);
+ quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+ quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+ quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+ quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+ quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+ quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+ quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+ /* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */
+ bitmap = vorr_s16(row7, row6);
+ bitmap = vorr_s16(bitmap, row5);
+ bitmap = vorr_s16(bitmap, row4);
+ bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+ bitmap = vorr_s16(bitmap, row3);
+ bitmap = vorr_s16(bitmap, row2);
+ bitmap = vorr_s16(bitmap, row1);
+ int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+ /* If this remains non-zero, a "regular" second pass will be performed. */
+ int64_t right_ac_dc_bitmap = 1;
+
+ if (right_ac_bitmap == 0) {
+ bitmap = vorr_s16(bitmap, row0);
+ right_ac_dc_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+ if (right_ac_dc_bitmap != 0) {
+ int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+ int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+ /* Store 4x4 blocks to workspace, transposing in the process. */
+ vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
+ vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
+ }
+ } else {
+ if (bitmap_rows_4567 == 0) {
+ jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+ quant_row1, quant_row2, quant_row3,
+ workspace_l + 4 * DCTSIZE / 2,
+ workspace_r + 4 * DCTSIZE / 2);
+ } else {
+ jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+ row6, row7, quant_row0, quant_row1,
+ quant_row2, quant_row3, quant_row4,
+ quant_row5, quant_row6, quant_row7,
+ workspace_l + 4 * DCTSIZE / 2,
+ workspace_r + 4 * DCTSIZE / 2);
+ }
+ }
+
+ /* Second pass: compute IDCT on rows in workspace. */
+
+ /* If all coefficients in right 4x8 block are 0, use "sparse" second pass. */
+ if (right_ac_dc_bitmap == 0) {
+ jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
+ jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
+ } else {
+ jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
+ jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
+ }
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients. (To process the full 8x8 DCT block, this
+ * function, or some other optimized variant, needs to be called for both the
+ * left and right 4x8 blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of AC coefficients is all 0.
+ *
+ * The original C implementation of the accurate IDCT (jpeg_idct_islow()) can be
+ * found in jidctint.c. Algorithmic changes made here are documented inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t row4,
+ int16x4_t row5,
+ int16x4_t row6,
+ int16x4_t row7,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16x4_t quant_row4,
+ int16x4_t quant_row5,
+ int16x4_t quant_row6,
+ int16x4_t quant_row7,
+ int16_t *workspace_1,
+ int16_t *workspace_2)
+{
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Even part */
+ int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+ int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+ tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+ z2_s16 = vmul_s16(row0, quant_row0);
+ z3_s16 = vmul_s16(row4, quant_row4);
+
+ int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part */
+ int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
+ int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
+ int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+ int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+ z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+ int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+ /* Implementation as per jpeg_idct_islow() in jidctint.c:
+ * z5 = (z3 + z4) * 1.175875602;
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ * z3 += z5; z4 += z5;
+ *
+ * This implementation:
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+ */
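+  /* Expanding the first line gives (z3 + z4) * 1.175875602 -
+   * z3 * 1.961570560, i.e. z5 + z3 * -1.961570560 from jidctint.c; likewise
+   * the second gives z5 + z4 * -0.390180644.  The two formulations are
+   * therefore identical, but this one maps directly onto
+   * vmull_lane_s16()/vmlal_lane_s16() pairs.
+   */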
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ /* Implementation as per jpeg_idct_islow() in jidctint.c:
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ * tmp0 += z1 + z3; tmp1 += z2 + z4;
+ * tmp2 += z2 + z3; tmp3 += z1 + z4;
+ *
+ * This implementation:
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ * tmp0 += z3; tmp1 += z4;
+ * tmp2 += z3; tmp3 += z4;
+ */
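+  /* Here the z1 = -(tmp0 + tmp3) * 0.899976223 and
+   * z2 = -(tmp1 + tmp2) * 2.562915447 terms are distributed over their
+   * summands and folded into the tmp multiplications (hence the pre-combined
+   * constant differences), so each of tmp0-tmp3 requires only one
+   * vmull_lane_s16() and one vmlsl_lane_s16() before z3 or z4 is added.
+   */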
+
+ tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+ tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+ tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+ tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+ tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+ tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+ tmp0 = vaddq_s32(tmp0, z3);
+ tmp1 = vaddq_s32(tmp1, z4);
+ tmp2 = vaddq_s32(tmp2, z3);
+ tmp3 = vaddq_s32(tmp3, z4);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x4x4_t rows_0123 = { {
+ vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+ } };
+ int16x4x4_t rows_4567 = { {
+ vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+ } };
+
+ /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+ * (VST4 transposes the blocks. We need to operate on rows in the next
+ * pass.)
+ */
+ vst4_s16(workspace_1, rows_0123);
+ vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.
+ *
+ * This "sparse" version assumes that the AC coefficients in rows 4-7 are all
+ * 0. This simplifies the IDCT calculation, accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16_t *workspace_1,
+ int16_t *workspace_2)
+{
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Even part (z3 is all 0) */
+ int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+ z2_s16 = vmul_s16(row0, quant_row0);
+ int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part (tmp0 and tmp1 are both all 0) */
+ int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+ int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+ int16x4_t z3_s16 = tmp2_s16;
+ int16x4_t z4_s16 = tmp3_s16;
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+ tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x4x4_t rows_0123 = { {
+ vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+ } };
+ int16x4x4_t rows_4567 = { {
+ vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+ } };
+
+ /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+ * (VST4 transposes the blocks. We need to operate on rows in the next
+ * pass.)
+ */
+ vst4_s16(workspace_1, rows_0123);
+ vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform the second pass of the accurate inverse DCT on a 4x8 block of
+ * coefficients. (To process the full 8x8 DCT block, this function, or some
+ * other optimized variant, needs to be called for both the right and left 4x8
+ * blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of coefficient values is all 0 after the
+ * first pass.
+ *
+ * Again, the original C implementation of the accurate IDCT
+ * (jpeg_idct_islow()) can be found in jidctint.c. Algorithmic changes made
+ * here are documented inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset)
+{
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Even part */
+ int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+ int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+ tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+ z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+ z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2);
+
+ int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part */
+ int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
+ int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
+ int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+ int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+ z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+ int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+ /* Implementation as per jpeg_idct_islow() in jidctint.c:
+ * z5 = (z3 + z4) * 1.175875602;
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ * z3 += z5; z4 += z5;
+ *
+ * This implementation:
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+ */
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ /* Implementation as per jpeg_idct_islow() in jidctint.c:
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ * tmp0 += z1 + z3; tmp1 += z2 + z4;
+ * tmp2 += z2 + z3; tmp3 += z1 + z4;
+ *
+ * This implementation:
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ * tmp0 += z3; tmp1 += z4;
+ * tmp2 += z3; tmp3 += z4;
+ */
+
+ tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+ tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+ tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+ tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+ tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+ tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+ tmp0 = vaddq_s32(tmp0, z3);
+ tmp1 = vaddq_s32(tmp1, z4);
+ tmp2 = vaddq_s32(tmp2, z3);
+ tmp3 = vaddq_s32(tmp3, z4);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+ vaddhn_s32(tmp12, tmp1));
+ int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+ vaddhn_s32(tmp13, tmp0));
+ int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+ vsubhn_s32(tmp11, tmp2));
+ int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+ vsubhn_s32(tmp10, tmp3));
+ /* Descale and narrow to 8-bit. */
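+  /* (The vaddhn/vsubhn narrowing above already discarded the low 16 bits of
+   * each 32-bit sum, so only DESCALE_P2 - 16 bits of right shift remain.)
+   */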
+ int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+ int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+ int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+ int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+ /* Clamp to range [0-255]. */
+ uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+
+ /* Transpose 4x8 block and store to memory. (Zipping adjacent columns
+ * together allows us to store 16-bit elements.)
+ */
+ uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+ uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+ uint16x4x4_t cols_01_23_45_67 = { {
+ vreinterpret_u16_u8(cols_01_23.val[0]),
+ vreinterpret_u16_u8(cols_01_23.val[1]),
+ vreinterpret_u16_u8(cols_45_67.val[0]),
+ vreinterpret_u16_u8(cols_45_67.val[1])
+ } };
+
+ JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+ JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+ JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+ JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+ /* VST4 of 16-bit elements completes the transpose. */
+ vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+ vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+ vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+ vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
+
+
+/* Perform the second pass of the accurate inverse DCT on a 4x8 block
+ * of coefficients.
+ *
+ * This "sparse" version assumes that the coefficient values (after the first
+ * pass) in rows 4-7 are all 0. This simplifies the IDCT calculation,
+ * accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset)
+{
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Even part (z3 is all 0) */
+ int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+ z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+ int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part (tmp0 and tmp1 are both all 0) */
+ int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+ int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+ int16x4_t z3_s16 = tmp2_s16;
+ int16x4_t z4_s16 = tmp3_s16;
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+ tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+ vaddhn_s32(tmp12, tmp1));
+ int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+ vaddhn_s32(tmp13, tmp0));
+ int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+ vsubhn_s32(tmp11, tmp2));
+ int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+ vsubhn_s32(tmp10, tmp3));
+ /* Descale and narrow to 8-bit. */
+ int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+ int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+ int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+ int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+ /* Clamp to range [0-255]. */
+ uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+
+ /* Transpose 4x8 block and store to memory. (Zipping adjacent columns
+ * together allows us to store 16-bit elements.)
+ */
+ uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+ uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+ uint16x4x4_t cols_01_23_45_67 = { {
+ vreinterpret_u16_u8(cols_01_23.val[0]),
+ vreinterpret_u16_u8(cols_01_23.val[1]),
+ vreinterpret_u16_u8(cols_45_67.val[0]),
+ vreinterpret_u16_u8(cols_45_67.val[1])
+ } };
+
+ JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+ JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+ JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+ JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+ /* VST4 of 16-bit elements completes the transpose. */
+ vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+ vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+ vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+ vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
diff --git a/media/libjpeg/simd/arm/jidctred-neon.c b/media/libjpeg/simd/arm/jidctred-neon.c
new file mode 100644
index 0000000000..be9627e61d
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctred-neon.c
@@ -0,0 +1,486 @@
+/*
+ * jidctred-neon.c - reduced-size IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define F_0_211 1730
+#define F_0_509 4176
+#define F_0_601 4926
+#define F_0_720 5906
+#define F_0_765 6270
+#define F_0_850 6967
+#define F_0_899 7373
+#define F_1_061 8697
+#define F_1_272 10426
+#define F_1_451 11893
+#define F_1_847 15137
+#define F_2_172 17799
+#define F_2_562 20995
+#define F_3_624 29692
+
+
+/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
+ * 2x2 output from an 8x8 DCT block. It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_2x2() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.720959822 = 5906 * 2^-13
+ * 0.850430095 = 6967 * 2^-13
+ * 1.272758580 = 10426 * 2^-13
+ * 3.624509785 = 29692 * 2^-13
+ *
+ * See jidctred.c for further details of the 2x2 IDCT algorithm. Where
+ * possible, the variable names and comments here in jsimd_idct_2x2_neon()
+ * match up with those in jpeg_idct_2x2().
+ */
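+
+/* For illustration (assuming the usual round-to-nearest fixed-point scaling),
+ * each constant used here (F_0_720, F_0_850, F_1_272, F_3_624) is the
+ * corresponding real-valued factor listed above multiplied by 2^13 and
+ * rounded, e.g.:
+ *   F_0_720 = round(0.720959822 * 8192) = 5906
+ *   F_3_624 = round(3.624509785 * 8192) = 29692
+ */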
+
+ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
+ -F_0_720, F_0_850, -F_1_272, F_3_624
+};
+
+void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ ISLOW_MULT_TYPE *quantptr = dct_table;
+
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table values. */
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+ int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+ int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+ int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+ /* Dequantize DCT coefficients. */
+ row0 = vmulq_s16(row0, quant_row0);
+ row1 = vmulq_s16(row1, quant_row1);
+ row3 = vmulq_s16(row3, quant_row3);
+ row5 = vmulq_s16(row5, quant_row5);
+ row7 = vmulq_s16(row7, quant_row7);
+
+ /* Load IDCT conversion constants. */
+ const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);
+
+ /* Pass 1: process columns from input, put results in vectors row0 and
+ * row1.
+ */
+
+ /* Even part */
+ int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
+ int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
+
+ /* Odd part */
+ int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
+ int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
+ vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
+ row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
+ vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
+
+ /* Transpose two rows, ready for second pass. */
+ int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
+ int16x8_t cols_0246 = cols_0246_1357.val[0];
+ int16x8_t cols_1357 = cols_0246_1357.val[1];
+ /* Duplicate columns such that each is accessible in its own vector. */
+ int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
+ vreinterpretq_s32_s16(cols_1357));
+ int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
+ int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
+
+ /* Pass 2: process two rows, store to output array. */
+
+ /* Even part: we're only interested in col0; the top half of tmp10 is "don't
+ * care."
+ */
+ int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
+
+ /* Odd part: we're only interested in the bottom half of tmp0. */
+ int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
+ tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
+ tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
+ tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);
+
+ /* Final output stage: descale and clamp to range [0-255]. */
+ int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
+ vsubhn_s32(tmp10, tmp0));
+ output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
+ CONST_BITS + PASS1_BITS + 3 + 2 - 16);
+ /* Narrow to 8-bit and convert to unsigned. */
+ uint8x8_t output_u8 = vqmovun_s16(output_s16);
+
+ /* Store 2x2 block to memory. */
+ vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
+ vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
+ vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
+ vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
+}
+
+
+/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
+ * 4x4 output from an 8x8 DCT block. It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_4x4() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.211164243 = 1730 * 2^-13
+ * 0.509795579 = 4176 * 2^-13
+ * 0.601344887 = 4926 * 2^-13
+ * 0.765366865 = 6270 * 2^-13
+ * 0.899976223 = 7373 * 2^-13
+ * 1.061594337 = 8697 * 2^-13
+ * 1.451774981 = 11893 * 2^-13
+ * 1.847759065 = 15137 * 2^-13
+ * 2.172734803 = 17799 * 2^-13
+ * 2.562915447 = 20995 * 2^-13
+ *
+ * See jidctred.c for further details of the 4x4 IDCT algorithm. Where
+ * possible, the variable names and comments here in jsimd_idct_4x4_neon()
+ * match up with those in jpeg_idct_4x4().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
+ F_1_847, -F_0_765, -F_0_211, F_1_451,
+ -F_2_172, F_1_061, -F_0_509, -F_0_601,
+ F_0_899, F_2_562, 0, 0
+};
+
+void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ ISLOW_MULT_TYPE *quantptr = dct_table;
+
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+ int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
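+  /* (Row 4 is intentionally not loaded: the 4-point IDCT kernel used here
+   * never references the frequency-4 coefficients, matching jpeg_idct_4x4()
+   * in jidctred.c.)
+   */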
+
+ /* Load quantization table values for DC coefficients. */
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ /* Dequantize DC coefficients. */
+ row0 = vmulq_s16(row0, quant_row0);
+
+ /* Construct bitmap to test if all AC coefficients are 0. */
+ int16x8_t bitmap = vorrq_s16(row1, row2);
+ bitmap = vorrq_s16(bitmap, row3);
+ bitmap = vorrq_s16(bitmap, row5);
+ bitmap = vorrq_s16(bitmap, row6);
+ bitmap = vorrq_s16(bitmap, row7);
+
+ int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+ int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
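+  /* (Lanes 0-3 of the bitmap correspond to columns 0-3, so the low 64 bits
+   * indicate whether the left half of the block has any non-zero AC
+   * coefficients; the high 64 bits cover columns 4-7.)
+   */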
+
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
+#else
+ /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+ /* All AC coefficients are zero.
+ * Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
+ */
+ int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
+ row0 = dcval;
+ row1 = dcval;
+ row2 = dcval;
+ row3 = dcval;
+ } else if (left_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 0, 1, 2, and 3.
+ * Compute DC values for these columns.
+ */
+ int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
+
+ /* Commence regular IDCT computation for columns 4, 5, 6, and 7. */
+
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+ /* Even part */
+ int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+ int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
+ int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+ /* Odd part */
+ int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
+ z2 = vmul_s16(vget_high_s16(row5), quant_row5);
+ z3 = vmul_s16(vget_high_s16(row3), quant_row3);
+ int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);
+
+ tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+ tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+ tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+ tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+ tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1));
+ row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1));
+ row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1));
+ row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1));
+ } else if (right_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 4, 5, 6, and 7.
+ * Compute DC values for these columns.
+ */
+ int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
+
+ /* Commence regular IDCT computation for columns 0, 1, 2, and 3. */
+
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part */
+ int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+
+ int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
+ int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+ /* Odd part */
+ int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
+ z2 = vmul_s16(vget_low_s16(row5), quant_row5);
+ z3 = vmul_s16(vget_low_s16(row3), quant_row3);
+ int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);
+
+ tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+ tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+ tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+ tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+ tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ } else {
+    /* Both halves of the block contain non-zero AC coefficients; the full
+     * IDCT calculation is required.
+     */
+ int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+ int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+ int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+ int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+ int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+ int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part */
+ int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+ int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+ int16x8_t z2 = vmulq_s16(row2, quant_row2);
+ int16x8_t z3 = vmulq_s16(row6, quant_row6);
+
+ int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
+ int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);
+
+ int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
+ int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
+ int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
+ int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
+
+ /* Odd part */
+ int16x8_t z1 = vmulq_s16(row7, quant_row7);
+ z2 = vmulq_s16(row5, quant_row5);
+ z3 = vmulq_s16(row3, quant_row3);
+ int16x8_t z4 = vmulq_s16(row1, quant_row1);
+
+ tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
+ tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);
+
+ tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
+ tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
+ CONST_BITS - PASS1_BITS + 1));
+ row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
+ CONST_BITS - PASS1_BITS + 1));
+ row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
+ CONST_BITS - PASS1_BITS + 1));
+ row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
+ CONST_BITS - PASS1_BITS + 1));
+ }
+
+ /* Transpose 8x4 block to perform IDCT on rows in second pass. */
+ int16x8x2_t row_01 = vtrnq_s16(row0, row1);
+ int16x8x2_t row_23 = vtrnq_s16(row2, row3);
+
+ int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
+ vreinterpretq_s32_s16(row_23.val[0]));
+ int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
+ vreinterpretq_s32_s16(row_23.val[1]));
+
+ int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
+ int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
+ int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
+ int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
+ int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
+ int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
+ int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
+
+ /* Commence second pass of IDCT. */
+
+ /* Even part */
+ int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
+ int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
+ tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+ /* Odd part */
+ tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
+ tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
+ tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
+ tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);
+
+ tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
+ tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
+ tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);
+
+ /* Final output stage: descale and clamp to range [0-255]. */
+ int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
+ vsubhn_s32(tmp12, tmp0));
+ int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
+ vsubhn_s32(tmp10, tmp2));
+ output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
+ CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+ output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
+ CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+ /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
+ * An interleaving store completes the transpose.
+ */
+ uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
+ vqmovun_s16(output_cols_13));
+ uint16x4x2_t output_01_23 = { {
+ vreinterpret_u16_u8(output_0123.val[0]),
+ vreinterpret_u16_u8(output_0123.val[1])
+ } };
+
+ /* Store 4x4 block to memory. */
+ JSAMPROW outptr0 = output_buf[0] + output_col;
+ JSAMPROW outptr1 = output_buf[1] + output_col;
+ JSAMPROW outptr2 = output_buf[2] + output_col;
+ JSAMPROW outptr3 = output_buf[3] + output_col;
+ vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
+ vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
+ vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
+ vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
+}
diff --git a/media/libjpeg/simd/arm/jquanti-neon.c b/media/libjpeg/simd/arm/jquanti-neon.c
new file mode 100644
index 0000000000..d5d95d89f6
--- /dev/null
+++ b/media/libjpeg/simd/arm/jquanti-neon.c
@@ -0,0 +1,193 @@
+/*
+ * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* After downsampling, the resulting sample values are in the range [0, 255],
+ * but the Discrete Cosine Transform (DCT) operates on values centered around
+ * 0.
+ *
+ * To prepare sample values for the DCT, load samples into a DCT workspace,
+ * subtracting CENTERJSAMPLE (128). The samples, now in the range [-128, 127],
+ * are also widened from 8- to 16-bit.
+ *
+ * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
+ */
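+
+/* Worked example (illustrative): an input sample of 200 is stored in the
+ * workspace as 200 - 128 = 72, and a sample of 3 becomes 3 - 128 = -125;
+ * the widening unsigned subtraction below wraps modulo 2^16 and is then
+ * reinterpreted as signed 16-bit.
+ */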
+
+void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
+ uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
+ uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
+ uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
+ uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
+ uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
+ uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
+ uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
+
+ int16x8_t row0 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row1 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row2 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row3 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row4 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row5 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row6 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row7 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
+
+ vst1q_s16(workspace + 0 * DCTSIZE, row0);
+ vst1q_s16(workspace + 1 * DCTSIZE, row1);
+ vst1q_s16(workspace + 2 * DCTSIZE, row2);
+ vst1q_s16(workspace + 3 * DCTSIZE, row3);
+ vst1q_s16(workspace + 4 * DCTSIZE, row4);
+ vst1q_s16(workspace + 5 * DCTSIZE, row5);
+ vst1q_s16(workspace + 6 * DCTSIZE, row6);
+ vst1q_s16(workspace + 7 * DCTSIZE, row7);
+}
+
+
+/* After the DCT, the resulting array of coefficient values needs to be divided
+ * by an array of quantization values.
+ *
+ * To avoid a slow division operation, the DCT coefficients are multiplied by
+ * the (scaled) reciprocals of the quantization values and then right-shifted.
+ *
+ * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
+ */
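+
+/* Rough sketch of the arithmetic (the exact reciprocal, correction, and shift
+ * values come from compute_reciprocal() in jcdctmgr.c): for a quantization
+ * value q, the stored reciprocal is approximately 2^(16 + shift) / q, so
+ *   ((abs(coef) + corr) * recip) >> (16 + shift)  ~=  abs(coef) / q,
+ * where the correction term implements the desired rounding behavior.
+ */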
+
+void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
+ DCTELEM *workspace)
+{
+ JCOEFPTR out_ptr = coef_block;
+ UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
+ UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
+ DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
+ int i;
+
+#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
+#pragma unroll
+#endif
+ for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
+ /* Load reciprocals of quantization values. */
+ uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
+ uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
+ uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
+ uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
+ uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
+ uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
+ uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
+ uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
+ int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
+ int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
+ int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
+ int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
+
+ /* Extract sign from coefficients. */
+ int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
+ int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
+ int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
+ int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
+ /* Get absolute value of DCT coefficients. */
+ uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
+ uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
+ uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
+ uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
+ /* Add correction. */
+ abs_row0 = vaddq_u16(abs_row0, corr0);
+ abs_row1 = vaddq_u16(abs_row1, corr1);
+ abs_row2 = vaddq_u16(abs_row2, corr2);
+ abs_row3 = vaddq_u16(abs_row3, corr3);
+
+ /* Multiply DCT coefficients by quantization reciprocals. */
+ int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
+ vget_low_u16(recip0)));
+ int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
+ vget_high_u16(recip0)));
+ int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
+ vget_low_u16(recip1)));
+ int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
+ vget_high_u16(recip1)));
+ int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
+ vget_low_u16(recip2)));
+ int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
+ vget_high_u16(recip2)));
+ int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
+ vget_low_u16(recip3)));
+ int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
+ vget_high_u16(recip3)));
+ /* Narrow back to 16-bit. */
+ row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
+ row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
+ row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
+ row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
+
+ /* Since VSHR only supports an immediate as its second argument, negate the
+ * shift value and shift left.
+ */
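+    /* (vshlq_u16() with a negative per-element shift count performs a logical
+     * right shift, so shifting "left" by -shift is a right shift by shift.)
+     */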
+ row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
+ vnegq_s16(shift0)));
+ row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
+ vnegq_s16(shift1)));
+ row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
+ vnegq_s16(shift2)));
+ row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
+ vnegq_s16(shift3)));
+
+ /* Restore sign to original product. */
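+    /* ((x ^ sign) - sign) negates x when sign is -1 (all ones) and leaves it
+     * unchanged when sign is 0.
+     */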
+ row0 = veorq_s16(row0, sign_row0);
+ row0 = vsubq_s16(row0, sign_row0);
+ row1 = veorq_s16(row1, sign_row1);
+ row1 = vsubq_s16(row1, sign_row1);
+ row2 = veorq_s16(row2, sign_row2);
+ row2 = vsubq_s16(row2, sign_row2);
+ row3 = veorq_s16(row3, sign_row3);
+ row3 = vsubq_s16(row3, sign_row3);
+
+ /* Store quantized coefficients to memory. */
+ vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
+ vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
+ vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
+ vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
+ }
+}
diff --git a/media/libjpeg/simd/arm/neon-compat.h b/media/libjpeg/simd/arm/neon-compat.h
new file mode 100644
index 0000000000..2907634e26
--- /dev/null
+++ b/media/libjpeg/simd/arm/neon-compat.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* Define compiler-independent count-leading-zeros and byte-swap macros */
+#if defined(_MSC_VER) && !defined(__clang__)
+#define BUILTIN_CLZ(x) _CountLeadingZeros(x)
+#define BUILTIN_CLZLL(x) _CountLeadingZeros64(x)
+#define BUILTIN_BSWAP64(x) _byteswap_uint64(x)
+#elif defined(__clang__) || defined(__GNUC__)
+#define BUILTIN_CLZ(x) __builtin_clz(x)
+#define BUILTIN_CLZLL(x) __builtin_clzll(x)
+#define BUILTIN_BSWAP64(x) __builtin_bswap64(x)
+#else
+#error "Unknown compiler"
+#endif
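+
+/* Usage sketch (illustrative): BUILTIN_CLZ() counts the leading zero bits of
+ * a non-zero 32-bit value, e.g. BUILTIN_CLZ(0x00010000) == 15. Callers must
+ * not pass zero, since __builtin_clz(0) is undefined.
+ */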