summaryrefslogtreecommitdiffstats
path: root/media/libvpx/libvpx/vpx_dsp/arm
diff options
context:
space:
mode:
Diffstat (limited to 'media/libvpx/libvpx/vpx_dsp/arm')
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c237
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c65
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c480
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c439
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h318
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c419
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h2919
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c85
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h105
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c143
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h307
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h542
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c168
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c158
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c140
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c64
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c215
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c1361
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c640
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c757
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c625
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c88
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c89
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c371
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h474
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c2514
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c776
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c305
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c273
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c408
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c586
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c509
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c931
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c183
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c113
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c58
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c77
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c764
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c674
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c58
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c513
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c776
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm66
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c47
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm188
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c59
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c65
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c59
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/idct_neon.asm46
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h919
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c1942
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm630
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm666
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm549
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm491
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c1107
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h443
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c290
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c344
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c570
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/save_reg_neon.asm34
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c490
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c137
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h223
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c100
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h1546
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c552
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm438
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm439
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm486
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm487
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm415
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm415
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c2110
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h261
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c41
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h29
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm457
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm455
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c139
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm116
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c100
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm84
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c65
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c320
85 files changed, 39147 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c
new file mode 100644
index 0000000000..8c61fc26f4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+// Returns the rounded average of the 16 pixels in a 4x4 block.
+// a: top-left pixel of the block; a_stride: row stride in bytes.
+uint32_t vpx_avg_4x4_neon(const uint8_t *a, int a_stride) {
+  const uint8x16_t b = load_unaligned_u8q(a, a_stride);
+  const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
+  // (sum + 8) >> 4: round-to-nearest division by 16 pixels.
+  return (horizontal_add_uint16x8(c) + (1 << 3)) >> 4;
+}
+
+// Returns the rounded average of the 64 pixels in an 8x8 block.
+// a: top-left pixel of the block; a_stride: row stride in bytes.
+uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) {
+  int i;
+  uint8x8_t b, c;
+  uint16x8_t sum;
+  // Widen and add the first two rows, then accumulate the remaining six.
+  b = vld1_u8(a);
+  a += a_stride;
+  c = vld1_u8(a);
+  a += a_stride;
+  sum = vaddl_u8(b, c);
+
+  for (i = 0; i < 6; ++i) {
+    const uint8x8_t d = vld1_u8(a);
+    a += a_stride;
+    sum = vaddw_u8(sum, d);
+  }
+
+  // (sum + 32) >> 6: round-to-nearest division by 64 pixels.
+  return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6;
+}
+
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+// satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+// Sum of absolute values of `length` transform coefficients.
+// Processes 16 coefficients per iteration, so `length` must be a
+// multiple of 16 (value range documented above: {16, 64, 256, 1024}).
+int vpx_satd_neon(const tran_low_t *coeff, int length) {
+  // Two independent accumulators to break the dependency chain.
+  int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+  do {
+    int16x8_t abs0, abs1;
+    const int16x8_t s0 = load_tran_low_to_s16q(coeff);
+    const int16x8_t s1 = load_tran_low_to_s16q(coeff + 8);
+
+    // Pairwise-accumulate |coeff| into 32-bit lanes (no 16-bit overflow).
+    abs0 = vabsq_s16(s0);
+    sum_s32[0] = vpadalq_s16(sum_s32[0], abs0);
+    abs1 = vabsq_s16(s1);
+    sum_s32[1] = vpadalq_s16(sum_s32[1], abs1);
+
+    length -= 16;
+    coeff += 16;
+  } while (length != 0);
+
+  return horizontal_add_int32x4(vaddq_s32(sum_s32[0], sum_s32[1]));
+}
+
+// Integral projection of a 16-pixel-wide column stripe: hbuf[0..15] gets the
+// per-column sum over `height` rows, scaled down by (height >> 5) + 3 bits
+// (negative vshlq shift == right shift). Rows are consumed 8 at a time, so
+// this assumes height is a multiple of 8.
+void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
+                          const int ref_stride, const int height) {
+  int i;
+  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_hi = vdupq_n_u16(0);
+  const int shift_factor = ((height >> 5) + 3) * -1;
+  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
+
+  for (i = 0; i < height; i += 8) {
+    const uint8x16_t vec_row1 = vld1q_u8(ref);
+    const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
+    const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
+    const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
+    const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
+    const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
+    const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
+    const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
+
+    ref += ref_stride * 8;
+  }
+
+  // Scale the column sums down; shift_factor is negative, so this is a
+  // right shift by (height >> 5) + 3.
+  vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
+  vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
+
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
+  hbuf += 8;
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
+}
+
+// Sums `width` consecutive pixels of a single row. width is consumed 16 at a
+// time, so it is assumed to be a multiple of 16.
+int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
+  int i;
+  uint16x8_t vec_sum = vdupq_n_u16(0);
+
+  for (i = 0; i < width; i += 16) {
+    const uint8x16_t vec_row = vld1q_u8(ref);
+    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
+    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
+    ref += 16;
+  }
+
+  return (int16_t)horizontal_add_uint16x8(vec_sum);
+}
+
+// ref, src = [0, 510] - max diff = 16-bits
+// bwl = {2, 3, 4}, width = {16, 32, 64}
+// Returns the variance of (ref - src): sse - mean*sum, where the mean is
+// approximated by (sum * sum) >> (bwl + 2) since width == 4 << bwl.
+int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
+  int width = 4 << bwl;
+  int32x4_t sse = vdupq_n_s32(0);
+  int16x8_t total = vdupq_n_s16(0);
+
+  assert(width >= 8);
+  assert((width % 8) == 0);
+
+  do {
+    const int16x8_t r = vld1q_s16(ref);
+    const int16x8_t s = vld1q_s16(src);
+    const int16x8_t diff = vsubq_s16(r, s);  // [-510, 510], 10 bits.
+    const int16x4_t diff_lo = vget_low_s16(diff);
+    const int16x4_t diff_hi = vget_high_s16(diff);
+    sse = vmlal_s16(sse, diff_lo, diff_lo);  // dynamic range 26 bits.
+    sse = vmlal_s16(sse, diff_hi, diff_hi);
+    total = vaddq_s16(total, diff);  // dynamic range 16 bits.
+
+    ref += 8;
+    src += 8;
+    width -= 8;
+  } while (width != 0);
+
+  {
+    // Note: 'total''s pairwise addition could be implemented similarly to
+    // horizontal_add_uint16x8(), but one less vpaddl with 'total' when paired
+    // with the summation of 'sse' performed better on a Cortex-A15.
+    const int32x4_t t0 = vpaddlq_s16(total);  // cascading summation of 'total'
+    const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
+    const int32x2_t t2 = vpadd_s32(t1, t1);
+    const int t = vget_lane_s32(t2, 0);
+    const int64x2_t s0 = vpaddlq_s32(sse);  // cascading summation of 'sse'.
+    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+                                  vreinterpret_s32_s64(vget_high_s64(s0)));
+    const int s = vget_lane_s32(s1, 0);
+    const int shift_factor = bwl + 2;
+    return s - ((t * t) >> shift_factor);
+  }
+}
+
+// Computes the minimum and maximum absolute pixel difference between two 8x8
+// blocks `a` and `b`, written to *min and *max.
+void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+                         int b_stride, int *min, int *max) {
+  // Load and concatenate.
+  const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride));
+  const uint8x16_t a23 =
+      vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride));
+  const uint8x16_t a45 =
+      vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride));
+  const uint8x16_t a67 =
+      vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride));
+
+  const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride));
+  const uint8x16_t b23 =
+      vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride));
+  const uint8x16_t b45 =
+      vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride));
+  const uint8x16_t b67 =
+      vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride));
+
+  // Absolute difference.
+  const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
+  const uint8x16_t ab23_diff = vabdq_u8(a23, b23);
+  const uint8x16_t ab45_diff = vabdq_u8(a45, b45);
+  const uint8x16_t ab67_diff = vabdq_u8(a67, b67);
+
+  // Max values between the Q vectors.
+  const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff);
+  const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff);
+  const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff);
+  const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff);
+
+  const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
+  const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
+
+#if VPX_ARCH_AARCH64
+  // AArch64 has across-vector reductions; reduce directly.
+  *min = *max = 0;  // Clear high bits
+  *((uint8_t *)max) = vmaxvq_u8(ab07_max);
+  *((uint8_t *)min) = vminvq_u8(ab07_min);
+#else
+  // Split into 64-bit vectors and execute pairwise min/max.
+  uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
+  uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
+
+  // Enough runs of vpmax/min propagate the max/min values to every position.
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  *min = *max = 0;  // Clear high bits
+  // Store directly to avoid costly neon->gpr transfer.
+  vst1_lane_u8((uint8_t *)max, ab_max, 0);
+  vst1_lane_u8((uint8_t *)min, ab_min, 0);
+#endif
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c
new file mode 100644
index 0000000000..5afdece0ab
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+// comp[i] = rounding average of pred[i] and ref[i] (vrhadd rounds up on ties).
+// pred and comp are contiguous (stride == width); ref uses ref_stride.
+// Three paths: width > 8 (16 pixels at a time per row), width == 8 (two rows
+// combined into one Q register), width == 4 (four rows gathered at once).
+void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width,
+                            int height, const uint8_t *ref, int ref_stride) {
+  if (width > 8) {
+    int x, y = height;
+    do {
+      for (x = 0; x < width; x += 16) {
+        const uint8x16_t p = vld1q_u8(pred + x);
+        const uint8x16_t r = vld1q_u8(ref + x);
+        const uint8x16_t avg = vrhaddq_u8(p, r);
+        vst1q_u8(comp + x, avg);
+      }
+      comp += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--y);
+  } else if (width == 8) {
+    int i = width * height;
+    do {
+      const uint8x16_t p = vld1q_u8(pred);
+      uint8x16_t r;
+      // Pack two 8-wide rows of ref into one 16-lane vector.
+      const uint8x8_t r_0 = vld1_u8(ref);
+      const uint8x8_t r_1 = vld1_u8(ref + ref_stride);
+      r = vcombine_u8(r_0, r_1);
+      ref += 2 * ref_stride;
+      r = vrhaddq_u8(r, p);
+      vst1q_u8(comp, r);
+
+      pred += 16;
+      comp += 16;
+      i -= 16;
+    } while (i);
+  } else {
+    int i = width * height;
+    assert(width == 4);
+    do {
+      const uint8x16_t p = vld1q_u8(pred);
+      uint8x16_t r;
+
+      // Gather four 4-wide rows of ref into one 16-lane vector.
+      r = load_unaligned_u8q(ref, ref_stride);
+      ref += 4 * ref_stride;
+      r = vrhaddq_u8(r, p);
+      vst1q_u8(comp, r);
+
+      pred += 16;
+      comp += 16;
+      i -= 16;
+    } while (i);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c
new file mode 100644
index 0000000000..7efce32735
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+extern const int16_t vpx_rv[];
+
+// Filtered candidate value: cascaded rounding averages of the two pixels on
+// each side (a2,a1 above/left; b1,b2 below/right) and the center pixel v0.
+static uint8x8_t average_k_out(const uint8x8_t a2, const uint8x8_t a1,
+                               const uint8x8_t v0, const uint8x8_t b1,
+                               const uint8x8_t b2) {
+  const uint8x8_t k1 = vrhadd_u8(a2, a1);
+  const uint8x8_t k2 = vrhadd_u8(b2, b1);
+  const uint8x8_t k3 = vrhadd_u8(k1, k2);
+  return vrhadd_u8(k3, v0);
+}
+
+// Per-lane mask: all-ones where the largest absolute difference between v0
+// and each of its four neighbors is below the filter limit (i.e. the area is
+// flat enough to smooth).
+static uint8x8_t generate_mask(const uint8x8_t a2, const uint8x8_t a1,
+                               const uint8x8_t v0, const uint8x8_t b1,
+                               const uint8x8_t b2, const uint8x8_t filter) {
+  const uint8x8_t a2_v0 = vabd_u8(a2, v0);
+  const uint8x8_t a1_v0 = vabd_u8(a1, v0);
+  const uint8x8_t b1_v0 = vabd_u8(b1, v0);
+  const uint8x8_t b2_v0 = vabd_u8(b2, v0);
+
+  uint8x8_t max = vmax_u8(a2_v0, a1_v0);
+  max = vmax_u8(b1_v0, max);
+  max = vmax_u8(b2_v0, max);
+  return vclt_u8(max, filter);
+}
+
+// Select the smoothed value where the flatness mask passes, otherwise keep
+// the original center pixel v0.
+static uint8x8_t generate_output(const uint8x8_t a2, const uint8x8_t a1,
+                                 const uint8x8_t v0, const uint8x8_t b1,
+                                 const uint8x8_t b2, const uint8x8_t filter) {
+  const uint8x8_t k_out = average_k_out(a2, a1, v0, b1, b2);
+  const uint8x8_t mask = generate_mask(a2, a1, v0, b1, b2, filter);
+
+  return vbsl_u8(mask, k_out, v0);
+}
+
+// Same functions but for uint8x16_t.
+// 16-lane variant of average_k_out.
+static uint8x16_t average_k_outq(const uint8x16_t a2, const uint8x16_t a1,
+                                 const uint8x16_t v0, const uint8x16_t b1,
+                                 const uint8x16_t b2) {
+  const uint8x16_t k1 = vrhaddq_u8(a2, a1);
+  const uint8x16_t k2 = vrhaddq_u8(b2, b1);
+  const uint8x16_t k3 = vrhaddq_u8(k1, k2);
+  return vrhaddq_u8(k3, v0);
+}
+
+// 16-lane variant of generate_mask.
+static uint8x16_t generate_maskq(const uint8x16_t a2, const uint8x16_t a1,
+                                 const uint8x16_t v0, const uint8x16_t b1,
+                                 const uint8x16_t b2, const uint8x16_t filter) {
+  const uint8x16_t a2_v0 = vabdq_u8(a2, v0);
+  const uint8x16_t a1_v0 = vabdq_u8(a1, v0);
+  const uint8x16_t b1_v0 = vabdq_u8(b1, v0);
+  const uint8x16_t b2_v0 = vabdq_u8(b2, v0);
+
+  uint8x16_t max = vmaxq_u8(a2_v0, a1_v0);
+  max = vmaxq_u8(b1_v0, max);
+  max = vmaxq_u8(b2_v0, max);
+  return vcltq_u8(max, filter);
+}
+
+// 16-lane variant of generate_output.
+static uint8x16_t generate_outputq(const uint8x16_t a2, const uint8x16_t a1,
+                                   const uint8x16_t v0, const uint8x16_t b1,
+                                   const uint8x16_t b2,
+                                   const uint8x16_t filter) {
+  const uint8x16_t k_out = average_k_outq(a2, a1, v0, b1, b2);
+  const uint8x16_t mask = generate_maskq(a2, a1, v0, b1, b2, filter);
+
+  return vbslq_u8(mask, k_out, v0);
+}
+
+// Two-pass deblocking for one macroblock row. Pass 1 filters vertically
+// (each output uses the pixel and two neighbors above/below) from src into
+// dst, 16 columns at a time with an 8-wide tail. Pass 2 filters horizontally
+// in place on dst by transposing 8x8 tiles, reusing the vertical kernels, and
+// transposing back. `f` holds per-column filter limits; `size` is the number
+// of rows; borders are extended by pixel replication.
+void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr,
+                                               uint8_t *dst_ptr, int src_stride,
+                                               int dst_stride, int cols,
+                                               uint8_t *f, int size) {
+  uint8_t *src, *dst;
+  int row;
+  int col;
+
+  // While columns of length 16 can be processed, load them.
+  for (col = 0; col < cols - 8; col += 16) {
+    uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7;
+    // Start two rows above the block so the first outputs have full context.
+    src = src_ptr - 2 * src_stride;
+    dst = dst_ptr;
+
+    a0 = vld1q_u8(src);
+    src += src_stride;
+    a1 = vld1q_u8(src);
+    src += src_stride;
+    a2 = vld1q_u8(src);
+    src += src_stride;
+    a3 = vld1q_u8(src);
+    src += src_stride;
+
+    for (row = 0; row < size; row += 4) {
+      uint8x16_t v_out_0, v_out_1, v_out_2, v_out_3;
+      const uint8x16_t filterq = vld1q_u8(f + col);
+
+      a4 = vld1q_u8(src);
+      src += src_stride;
+      a5 = vld1q_u8(src);
+      src += src_stride;
+      a6 = vld1q_u8(src);
+      src += src_stride;
+      a7 = vld1q_u8(src);
+      src += src_stride;
+
+      v_out_0 = generate_outputq(a0, a1, a2, a3, a4, filterq);
+      v_out_1 = generate_outputq(a1, a2, a3, a4, a5, filterq);
+      v_out_2 = generate_outputq(a2, a3, a4, a5, a6, filterq);
+      v_out_3 = generate_outputq(a3, a4, a5, a6, a7, filterq);
+
+      vst1q_u8(dst, v_out_0);
+      dst += dst_stride;
+      vst1q_u8(dst, v_out_1);
+      dst += dst_stride;
+      vst1q_u8(dst, v_out_2);
+      dst += dst_stride;
+      vst1q_u8(dst, v_out_3);
+      dst += dst_stride;
+
+      // Rotate over to the next slot.
+      a0 = a4;
+      a1 = a5;
+      a2 = a6;
+      a3 = a7;
+    }
+
+    src_ptr += 16;
+    dst_ptr += 16;
+  }
+
+  // Clean up any left over column of length 8.
+  if (col != cols) {
+    uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7;
+    src = src_ptr - 2 * src_stride;
+    dst = dst_ptr;
+
+    a0 = vld1_u8(src);
+    src += src_stride;
+    a1 = vld1_u8(src);
+    src += src_stride;
+    a2 = vld1_u8(src);
+    src += src_stride;
+    a3 = vld1_u8(src);
+    src += src_stride;
+
+    for (row = 0; row < size; row += 4) {
+      uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3;
+      const uint8x8_t filter = vld1_u8(f + col);
+
+      a4 = vld1_u8(src);
+      src += src_stride;
+      a5 = vld1_u8(src);
+      src += src_stride;
+      a6 = vld1_u8(src);
+      src += src_stride;
+      a7 = vld1_u8(src);
+      src += src_stride;
+
+      v_out_0 = generate_output(a0, a1, a2, a3, a4, filter);
+      v_out_1 = generate_output(a1, a2, a3, a4, a5, filter);
+      v_out_2 = generate_output(a2, a3, a4, a5, a6, filter);
+      v_out_3 = generate_output(a3, a4, a5, a6, a7, filter);
+
+      vst1_u8(dst, v_out_0);
+      dst += dst_stride;
+      vst1_u8(dst, v_out_1);
+      dst += dst_stride;
+      vst1_u8(dst, v_out_2);
+      dst += dst_stride;
+      vst1_u8(dst, v_out_3);
+      dst += dst_stride;
+
+      // Rotate over to the next slot.
+      a0 = a4;
+      a1 = a5;
+      a2 = a6;
+      a3 = a7;
+    }
+
+    // Not strictly necessary but makes resetting dst_ptr easier.
+    dst_ptr += 8;
+  }
+
+  dst_ptr -= cols;
+
+  // Pass 2: horizontal filtering via transposed 8x8 tiles.
+  for (row = 0; row < size; row += 8) {
+    uint8x8_t a0, a1, a2, a3;
+    uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+    src = dst_ptr;
+    dst = dst_ptr;
+
+    // Load 8 values, transpose 4 of them, and discard 2 because they will be
+    // reloaded later.
+    load_and_transpose_u8_4x8(src, dst_stride, &a0, &a1, &a2, &a3);
+    a3 = a1;
+    a2 = a1 = a0;  // Extend left border.
+
+    src += 2;
+
+    for (col = 0; col < cols; col += 8) {
+      uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3, v_out_4, v_out_5, v_out_6,
+          v_out_7;
+      // Although the filter is meant to be applied vertically and is instead
+      // being applied horizontally here it's OK because it's set in blocks of 8
+      // (or 16).
+      const uint8x8_t filter = vld1_u8(f + col);
+
+      load_and_transpose_u8_8x8(src, dst_stride, &b0, &b1, &b2, &b3, &b4, &b5,
+                                &b6, &b7);
+
+      if (col + 8 == cols) {
+        // Last row. Extend border (b5).
+        b6 = b7 = b5;
+      }
+
+      v_out_0 = generate_output(a0, a1, a2, a3, b0, filter);
+      v_out_1 = generate_output(a1, a2, a3, b0, b1, filter);
+      v_out_2 = generate_output(a2, a3, b0, b1, b2, filter);
+      v_out_3 = generate_output(a3, b0, b1, b2, b3, filter);
+      v_out_4 = generate_output(b0, b1, b2, b3, b4, filter);
+      v_out_5 = generate_output(b1, b2, b3, b4, b5, filter);
+      v_out_6 = generate_output(b2, b3, b4, b5, b6, filter);
+      v_out_7 = generate_output(b3, b4, b5, b6, b7, filter);
+
+      transpose_and_store_u8_8x8(dst, dst_stride, v_out_0, v_out_1, v_out_2,
+                                 v_out_3, v_out_4, v_out_5, v_out_6, v_out_7);
+
+      a0 = b4;
+      a1 = b5;
+      a2 = b6;
+      a3 = b7;
+
+      src += 8;
+      dst += 8;
+    }
+
+    dst_ptr += 8 * dst_stride;
+  }
+}
+
+// sum += x;
+// sumsq += x * y;
+// Sliding-window update: each lane accumulates itself plus the lanes shifted
+// in from the left (vext by 1..3), so lane n holds the running total of
+// lanes 0..n added on top of the incoming prefix value.
+static void accumulate_sum_sumsq(const int16x4_t x, const int32x4_t xy,
+                                 int16x4_t *const sum, int32x4_t *const sumsq) {
+  const int16x4_t zero = vdup_n_s16(0);
+  const int32x4_t zeroq = vdupq_n_s32(0);
+
+  // Add in the first set because vext doesn't work with '0'.
+  *sum = vadd_s16(*sum, x);
+  *sumsq = vaddq_s32(*sumsq, xy);
+
+  // Shift x and xy to the right and sum. vext requires an immediate.
+  *sum = vadd_s16(*sum, vext_s16(zero, x, 1));
+  *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 1));
+
+  *sum = vadd_s16(*sum, vext_s16(zero, x, 2));
+  *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 2));
+
+  *sum = vadd_s16(*sum, vext_s16(zero, x, 3));
+  *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 3));
+}
+
+// Generate mask based on (sumsq * 15 - sum * sum < flimit)
+// i.e. pass where the 15-sample windowed variance is below the limit.
+static uint16x4_t calculate_mask(const int16x4_t sum, const int32x4_t sumsq,
+                                 const int32x4_t f, const int32x4_t fifteen) {
+  const int32x4_t a = vmulq_s32(sumsq, fifteen);
+  const int32x4_t b = vmlsl_s16(a, sum, sum);
+  const uint32x4_t mask32 = vcltq_s32(b, f);
+  return vmovn_u32(mask32);
+}
+
+// Evaluate the variance test on both 4-lane halves and narrow the two 16-bit
+// masks into a single 8-lane byte mask for vbsl.
+static uint8x8_t combine_mask(const int16x4_t sum_low, const int16x4_t sum_high,
+                              const int32x4_t sumsq_low,
+                              const int32x4_t sumsq_high, const int32x4_t f) {
+  const int32x4_t fifteen = vdupq_n_s32(15);
+  const uint16x4_t mask16_low = calculate_mask(sum_low, sumsq_low, f, fifteen);
+  const uint16x4_t mask16_high =
+      calculate_mask(sum_high, sumsq_high, f, fifteen);
+  return vmovn_u16(vcombine_u16(mask16_low, mask16_high));
+}
+
+// Apply filter of (8 + sum + s[c]) >> 4.
+// The '+ 8' rounding term comes from vqrshrun's rounding right shift.
+static uint8x8_t filter_pixels(const int16x8_t sum, const uint8x8_t s) {
+  const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s));
+  const int16x8_t sum_s = vaddq_s16(sum, s16);
+
+  return vqrshrun_n_s16(sum_s, 4);
+}
+
+// In-place horizontal noise filter. For each pixel a 15-wide sliding window
+// (c-7 .. c+7) sum and sum-of-squares are maintained incrementally; a pixel
+// is replaced by (8 + sum + s[c]) >> 4 only where
+// sumsq * 15 - sum * sum < flimit. Row ends are extended by replication.
+void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols,
+                                    int flimit) {
+  int row, col;
+  const int32x4_t f = vdupq_n_s32(flimit);
+
+  assert(cols % 8 == 0);
+
+  for (row = 0; row < rows; ++row) {
+    // Sum the first 8 elements, which are extended from s[0].
+    // sumsq gets primed with +16.
+    int sumsq = src[0] * src[0] * 9 + 16;
+    int sum = src[0] * 9;
+
+    uint8x8_t left_context, s, right_context;
+    int16x4_t sum_low, sum_high;
+    int32x4_t sumsq_low, sumsq_high;
+
+    // Sum (+square) the next 6 elements.
+    // Skip [0] because it's included above.
+    for (col = 1; col <= 6; ++col) {
+      sumsq += src[col] * src[col];
+      sum += src[col];
+    }
+
+    // Prime the sums. Later the loop uses the _high values to prime the new
+    // vectors.
+    sumsq_high = vdupq_n_s32(sumsq);
+    sum_high = vdup_n_s16(sum);
+
+    // Manually extend the left border.
+    left_context = vdup_n_u8(src[0]);
+
+    for (col = 0; col < cols; col += 8) {
+      uint8x8_t mask, output;
+      int16x8_t x, y;
+      int32x4_t xy_low, xy_high;
+
+      s = vld1_u8(src + col);
+
+      if (col + 8 == cols) {
+        // Last row. Extend border.
+        right_context = vdup_n_u8(src[col + 7]);
+      } else {
+        right_context = vld1_u8(src + col + 7);
+      }
+
+      // Incremental window update: new sum delta is (incoming - outgoing),
+      // new sumsq delta is (in - out) * (in + out) == in^2 - out^2.
+      x = vreinterpretq_s16_u16(vsubl_u8(right_context, left_context));
+      y = vreinterpretq_s16_u16(vaddl_u8(right_context, left_context));
+      xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y));
+      xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y));
+
+      // Catch up to the last sum'd value.
+      sum_low = vdup_lane_s16(sum_high, 3);
+      sumsq_low = vdupq_lane_s32(vget_high_s32(sumsq_high), 1);
+
+      accumulate_sum_sumsq(vget_low_s16(x), xy_low, &sum_low, &sumsq_low);
+
+      // Need to do this sequentially because we need the max value from
+      // sum_low.
+      sum_high = vdup_lane_s16(sum_low, 3);
+      sumsq_high = vdupq_lane_s32(vget_high_s32(sumsq_low), 1);
+
+      accumulate_sum_sumsq(vget_high_s16(x), xy_high, &sum_high, &sumsq_high);
+
+      mask = combine_mask(sum_low, sum_high, sumsq_low, sumsq_high, f);
+
+      output = filter_pixels(vcombine_s16(sum_low, sum_high), s);
+      output = vbsl_u8(mask, output, s);
+
+      vst1_u8(src + col, output);
+
+      left_context = s;
+    }
+
+    src += pitch;
+  }
+}
+
+// Apply filter of (vpx_rv + sum + s[c]) >> 4.
+// Unlike filter_pixels(), the rounding term comes from the vpx_rv table
+// (added explicitly), so the shift itself does not round (vqshrun).
+static uint8x8_t filter_pixels_rv(const int16x8_t sum, const uint8x8_t s,
+                                  const int16x8_t rv) {
+  const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s));
+  const int16x8_t sum_s = vaddq_s16(sum, s16);
+  const int16x8_t rounded = vaddq_s16(sum_s, rv);
+
+  return vqshrun_n_s16(rounded, 4);
+}
+
+// In-place vertical noise filter, the column analog of
+// vpx_mbpost_proc_across_ip_neon: per 8-wide stripe a 15-row sliding window
+// sum/sumsq is maintained; pixels passing the variance test
+// (sumsq * 15 - sum * sum < flimit) are replaced by
+// (vpx_rv[row & 127] + sum + s) >> 4. Top/bottom borders are replicated.
+void vpx_mbpost_proc_down_neon(uint8_t *dst, int pitch, int rows, int cols,
+                               int flimit) {
+  int row, col, i;
+  const int32x4_t f = vdupq_n_s32(flimit);
+  uint8x8_t below_context = vdup_n_u8(0);
+
+  // 8 columns are processed at a time.
+  // If rows is less than 8 the bottom border extension fails.
+  assert(cols % 8 == 0);
+  assert(rows >= 8);
+
+  // Load and keep the first 8 values in memory. Process a vertical stripe that
+  // is 8 wide.
+  for (col = 0; col < cols; col += 8) {
+    uint8x8_t s, above_context[8];
+    int16x8_t sum, sum_tmp;
+    int32x4_t sumsq_low, sumsq_high;
+
+    // Load and extend the top border.
+    s = vld1_u8(dst);
+    for (i = 0; i < 8; i++) {
+      above_context[i] = s;
+    }
+
+    sum_tmp = vreinterpretq_s16_u16(vmovl_u8(s));
+
+    // sum * 9
+    sum = vmulq_n_s16(sum_tmp, 9);
+
+    // (sum * 9) * sum == sum * sum * 9
+    sumsq_low = vmull_s16(vget_low_s16(sum), vget_low_s16(sum_tmp));
+    sumsq_high = vmull_s16(vget_high_s16(sum), vget_high_s16(sum_tmp));
+
+    // Load and discard the next 6 values to prime sum and sumsq.
+    for (i = 1; i <= 6; ++i) {
+      const uint8x8_t a = vld1_u8(dst + i * pitch);
+      const int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(a));
+      sum = vaddq_s16(sum, b);
+
+      sumsq_low = vmlal_s16(sumsq_low, vget_low_s16(b), vget_low_s16(b));
+      sumsq_high = vmlal_s16(sumsq_high, vget_high_s16(b), vget_high_s16(b));
+    }
+
+    for (row = 0; row < rows; ++row) {
+      uint8x8_t mask, output;
+      int16x8_t x, y;
+      int32x4_t xy_low, xy_high;
+
+      s = vld1_u8(dst + row * pitch);
+
+      // Extend the bottom border.
+      if (row + 7 < rows) {
+        below_context = vld1_u8(dst + (row + 7) * pitch);
+      }
+
+      // Incremental window update, as in the horizontal filter:
+      // (in - out) and (in + out) multiply to in^2 - out^2.
+      x = vreinterpretq_s16_u16(vsubl_u8(below_context, above_context[0]));
+      y = vreinterpretq_s16_u16(vaddl_u8(below_context, above_context[0]));
+      xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y));
+      xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y));
+
+      sum = vaddq_s16(sum, x);
+
+      sumsq_low = vaddq_s32(sumsq_low, xy_low);
+      sumsq_high = vaddq_s32(sumsq_high, xy_high);
+
+      mask = combine_mask(vget_low_s16(sum), vget_high_s16(sum), sumsq_low,
+                          sumsq_high, f);
+
+      output = filter_pixels_rv(sum, s, vld1q_s16(vpx_rv + (row & 127)));
+      output = vbsl_u8(mask, output, s);
+
+      vst1_u8(dst + row * pitch, output);
+
+      // Shift the 8-row history of pixels leaving the window.
+      above_context[0] = above_context[1];
+      above_context[1] = above_context[2];
+      above_context[2] = above_context[3];
+      above_context[3] = above_context[4];
+      above_context[4] = above_context[5];
+      above_context[5] = above_context[6];
+      above_context[6] = above_context[7];
+      above_context[7] = s;
+    }
+
+    dst += 8;
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c
new file mode 100644
index 0000000000..fde71ff30d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct16x16_neon.h"
+
+// Some builds of gcc 4.9.2 and .3 have trouble with some of the inline
+// functions.
+#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \
+ __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4
+
// Fallback for the broken gcc 4.9.2/4.9.3 builds detected above: delegate
// straight to the plain C implementation instead of the NEON path.
void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
  vpx_fdct16x16_c(input, output, stride);
}
+
+#else
+
+// Main body of fdct16x16.
// Main body of fdct16x16.
//
// One 8-wide column pass of the 16-point forward DCT: `in` holds 16 rows of 8
// int16 lanes (already crossed by load_cross()/cross_input(), so in[0..7] are
// the sums feeding the even outputs and in[8..15] the differences feeding the
// odd outputs). Results are written to `out` in natural coefficient order.
// `in` and `out` are 16-entry arrays; the callers in this file always pass
// distinct buffers. The butterfly_* helpers come from fdct_neon.h.
static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/,
                              int16x8_t *out /*[16]*/) {
  int16x8_t s[8];
  int16x8_t x[4];
  int16x8_t step[8];

  // stage 1
  // From fwd_txfm.c: Work on the first eight values; fdct8(input,
  // even_results);"
  s[0] = vaddq_s16(in[0], in[7]);
  s[1] = vaddq_s16(in[1], in[6]);
  s[2] = vaddq_s16(in[2], in[5]);
  s[3] = vaddq_s16(in[3], in[4]);
  s[4] = vsubq_s16(in[3], in[4]);
  s[5] = vsubq_s16(in[2], in[5]);
  s[6] = vsubq_s16(in[1], in[6]);
  s[7] = vsubq_s16(in[0], in[7]);

  // fdct4(step, step);
  x[0] = vaddq_s16(s[0], s[3]);
  x[1] = vaddq_s16(s[1], s[2]);
  x[2] = vsubq_s16(s[1], s[2]);
  x[3] = vsubq_s16(s[0], s[3]);

  // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
  // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
  butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
                                          &out[8]);
  // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
  // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
  butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]);

  // Stage 2
  // Re-using source s5/s6
  // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
  // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
  butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]);

  // Stage 3
  x[0] = vaddq_s16(s[4], s[5]);
  x[1] = vsubq_s16(s[4], s[5]);
  x[2] = vsubq_s16(s[7], s[6]);
  x[3] = vaddq_s16(s[7], s[6]);

  // Stage 4
  // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64)
  // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
  butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]);
  // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
  // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
  butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]);

  // step 2
  // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
  // That file distinguished between "in_high" and "step1" but the only
  // difference is that "in_high" is the first 8 values and "step 1" is the
  // second. Here, since they are all in one array, "step1" values are += 8.

  // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
  // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
  // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
  // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
  butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]);
  butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]);

  // step 3
  s[0] = vaddq_s16(in[8], s[3]);
  s[1] = vaddq_s16(in[9], s[2]);
  x[0] = vsubq_s16(in[9], s[2]);
  x[1] = vsubq_s16(in[8], s[3]);
  x[2] = vsubq_s16(in[15], s[4]);
  x[3] = vsubq_s16(in[14], s[5]);
  s[6] = vaddq_s16(in[14], s[5]);
  s[7] = vaddq_s16(in[15], s[4]);

  // step 4
  // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] *
  // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1]
  // * cospi_8_64)
  butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]);

  // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] *
  // cospi_24_64)
  butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]);

  // step 5
  step[0] = vaddq_s16(s[0], s[1]);
  step[1] = vsubq_s16(s[0], s[1]);
  step[2] = vaddq_s16(x[1], s[2]);
  step[3] = vsubq_s16(x[1], s[2]);
  step[4] = vsubq_s16(x[2], s[5]);
  step[5] = vaddq_s16(x[2], s[5]);
  step[6] = vsubq_s16(s[7], s[6]);
  step[7] = vaddq_s16(s[7], s[6]);

  // step 6
  // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
  // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
  butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9],
                      &out[7]);
  // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64)
  // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
  butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1],
                      &out[15]);

  // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
  // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64)
  butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13],
                      &out[3]);

  // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
  // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
  butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5],
                      &out[11]);
}
+
// Forward 16x16 DCT, standard bit depth.
//
// The transform is computed in two passes over 8-wide column strips:
// pass 1 runs on the scaled input (x4), the results are transposed so the
// second pass again works on columns, rounded (partial_round_shift), crossed,
// and run through the column body a second time before the final transpose
// into `output` (row stride 16).
void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
  int16x8_t temp0[16];
  int16x8_t temp1[16];
  int16x8_t temp2[16];
  int16x8_t temp3[16];

  // Left half.
  load_cross(input, stride, temp0);
  scale_input(temp0, temp1);
  vpx_fdct8x16_body(temp1, temp0);

  // Right half.
  load_cross(input + 8, stride, temp1);
  scale_input(temp1, temp2);
  vpx_fdct8x16_body(temp2, temp1);

  // Transpose top left and top right quarters into one contiguous location to
  // process to the top half.

  transpose_s16_8x8q(&temp0[0], &temp2[0]);
  transpose_s16_8x8q(&temp1[0], &temp2[8]);
  partial_round_shift(temp2);
  cross_input(temp2, temp3);
  vpx_fdct8x16_body(temp3, temp2);
  transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4],
                    &temp2[5], &temp2[6], &temp2[7]);
  transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12],
                    &temp2[13], &temp2[14], &temp2[15]);
  store(output, temp2);
  store(output + 8, temp2 + 8);
  // Advance past the 8 rows just written (stride is 16 coefficients).
  output += 8 * 16;

  // Transpose bottom left and bottom right quarters into one contiguous
  // location to process to the bottom half.
  transpose_s16_8x8q(&temp0[8], &temp1[0]);

  transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
                    &temp1[13], &temp1[14], &temp1[15]);
  partial_round_shift(temp1);
  cross_input(temp1, temp0);
  vpx_fdct8x16_body(temp0, temp1);
  transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4],
                    &temp1[5], &temp1[6], &temp1[7]);
  transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
                    &temp1[13], &temp1[14], &temp1[15]);
  store(output, temp1);
  store(output + 8, temp1 + 8);
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// Main body of fdct8x16 column
// Main body of fdct8x16 column
//
// High-bitdepth variant of vpx_fdct8x16_body(): the 8 int16 lanes per row are
// split into two int32x4 halves, `left` (lanes 0-3) and `right` (lanes 4-7),
// each a 16-entry array. The transform is computed IN PLACE: outputs are
// written back into left[]/right[], which is why rows 8-15 are copied into
// inl[]/inr[] before any output is stored. The butterfly_*_s32_* helpers come
// from fdct_neon.h.
static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/,
                                     int32x4_t *right /* [16] */) {
  int32x4_t sl[8];
  int32x4_t sr[8];
  int32x4_t xl[4];
  int32x4_t xr[4];
  int32x4_t inl[8];
  int32x4_t inr[8];
  int32x4_t stepl[8];
  int32x4_t stepr[8];

  // stage 1
  // From fwd_txfm.c: Work on the first eight values; fdct8(input,
  // even_results);"
  sl[0] = vaddq_s32(left[0], left[7]);
  sr[0] = vaddq_s32(right[0], right[7]);
  sl[1] = vaddq_s32(left[1], left[6]);
  sr[1] = vaddq_s32(right[1], right[6]);
  sl[2] = vaddq_s32(left[2], left[5]);
  sr[2] = vaddq_s32(right[2], right[5]);
  sl[3] = vaddq_s32(left[3], left[4]);
  sr[3] = vaddq_s32(right[3], right[4]);
  sl[4] = vsubq_s32(left[3], left[4]);
  sr[4] = vsubq_s32(right[3], right[4]);
  sl[5] = vsubq_s32(left[2], left[5]);
  sr[5] = vsubq_s32(right[2], right[5]);
  sl[6] = vsubq_s32(left[1], left[6]);
  sr[6] = vsubq_s32(right[1], right[6]);
  sl[7] = vsubq_s32(left[0], left[7]);
  sr[7] = vsubq_s32(right[0], right[7]);

  // Copy values 8-15 as we're storing in-place
  inl[0] = left[8];
  inr[0] = right[8];
  inl[1] = left[9];
  inr[1] = right[9];
  inl[2] = left[10];
  inr[2] = right[10];
  inl[3] = left[11];
  inr[3] = right[11];
  inl[4] = left[12];
  inr[4] = right[12];
  inl[5] = left[13];
  inr[5] = right[13];
  inl[6] = left[14];
  inr[6] = right[14];
  inl[7] = left[15];
  inr[7] = right[15];

  // fdct4(step, step);
  xl[0] = vaddq_s32(sl[0], sl[3]);
  xr[0] = vaddq_s32(sr[0], sr[3]);
  xl[1] = vaddq_s32(sl[1], sl[2]);
  xr[1] = vaddq_s32(sr[1], sr[2]);
  xl[2] = vsubq_s32(sl[1], sl[2]);
  xr[2] = vsubq_s32(sr[1], sr[2]);
  xl[3] = vsubq_s32(sl[0], sl[3]);
  xr[3] = vsubq_s32(sr[0], sr[3]);

  // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
  // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
  butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
                               &left[0], &right[0], &left[8], &right[8]);

  // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
  // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
                                     cospi_24_64, &left[4], &right[4],
                                     &left[12], &right[12]);

  // Stage 2
  // Re-using source s5/s6
  // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
  // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
  butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6],
                               &sr[6], &sl[5], &sr[5]);

  // Stage 3
  xl[0] = vaddq_s32(sl[4], sl[5]);
  xr[0] = vaddq_s32(sr[4], sr[5]);
  xl[1] = vsubq_s32(sl[4], sl[5]);
  xr[1] = vsubq_s32(sr[4], sr[5]);
  xl[2] = vsubq_s32(sl[7], sl[6]);
  xr[2] = vsubq_s32(sr[7], sr[6]);
  xl[3] = vaddq_s32(sl[7], sl[6]);
  xr[3] = vaddq_s32(sr[7], sr[6]);

  // Stage 4
  // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64)
  // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
                                     cospi_28_64, &left[2], &right[2],
                                     &left[14], &right[14]);
  // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
  // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
  butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
                                     cospi_12_64, &left[10], &right[10],
                                     &left[6], &right[6]);

  // step 2
  // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
  // That file distinguished between "in_high" and "step1" but the only
  // difference is that "in_high" is the first 8 values and "step 1" is the
  // second. Here, since they are all in one array, "step1" values are += 8.

  // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
  // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
  // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
  // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
  butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64,
                               &sl[5], &sr[5], &sl[2], &sr[2]);
  butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64,
                               &sl[4], &sr[4], &sl[3], &sr[3]);

  // step 3
  sl[0] = vaddq_s32(inl[0], sl[3]);
  sr[0] = vaddq_s32(inr[0], sr[3]);
  sl[1] = vaddq_s32(inl[1], sl[2]);
  sr[1] = vaddq_s32(inr[1], sr[2]);
  xl[0] = vsubq_s32(inl[1], sl[2]);
  xr[0] = vsubq_s32(inr[1], sr[2]);
  xl[1] = vsubq_s32(inl[0], sl[3]);
  xr[1] = vsubq_s32(inr[0], sr[3]);
  xl[2] = vsubq_s32(inl[7], sl[4]);
  xr[2] = vsubq_s32(inr[7], sr[4]);
  xl[3] = vsubq_s32(inl[6], sl[5]);
  xr[3] = vsubq_s32(inr[6], sr[5]);
  sl[6] = vaddq_s32(inl[6], sl[5]);
  sr[6] = vaddq_s32(inr[6], sr[5]);
  sl[7] = vaddq_s32(inl[7], sl[4]);
  sr[7] = vaddq_s32(inr[7], sr[4]);

  // step 4
  // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] *
  // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1]
  // * cospi_8_64)
  butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64,
                                     cospi_24_64, &sl[6], &sr[6], &sl[1],
                                     &sr[1]);
  // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] *
  // cospi_24_64)
  butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64,
                                     cospi_8_64, &sl[2], &sr[2], &sl[5],
                                     &sr[5]);

  // step 5
  stepl[0] = vaddq_s32(sl[0], sl[1]);
  stepr[0] = vaddq_s32(sr[0], sr[1]);
  stepl[1] = vsubq_s32(sl[0], sl[1]);
  stepr[1] = vsubq_s32(sr[0], sr[1]);
  stepl[2] = vaddq_s32(xl[1], sl[2]);
  stepr[2] = vaddq_s32(xr[1], sr[2]);
  stepl[3] = vsubq_s32(xl[1], sl[2]);
  stepr[3] = vsubq_s32(xr[1], sr[2]);
  stepl[4] = vsubq_s32(xl[2], sl[5]);
  stepr[4] = vsubq_s32(xr[2], sr[5]);
  stepl[5] = vaddq_s32(xl[2], sl[5]);
  stepr[5] = vaddq_s32(xr[2], sr[5]);
  stepl[6] = vsubq_s32(sl[7], sl[6]);
  stepr[6] = vsubq_s32(sr[7], sr[6]);
  stepl[7] = vaddq_s32(sl[7], sl[6]);
  stepr[7] = vaddq_s32(sr[7], sr[6]);

  // step 6
  // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
  // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
  butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1],
                                     cospi_18_64, cospi_14_64, &left[9],
                                     &right[9], &left[7], &right[7]);
  // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64)
  // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
  butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0],
                                     cospi_2_64, cospi_30_64, &left[1],
                                     &right[1], &left[15], &right[15]);
  // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
  // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64)
  butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3],
                                     cospi_26_64, cospi_6_64, &left[13],
                                     &right[13], &left[3], &right[3]);
  // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
  // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
  butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2],
                                     cospi_10_64, cospi_22_64, &left[5],
                                     &right[5], &left[11], &right[11]);
}
+
// Forward 16x16 DCT, high bit depth.
//
// Same two-pass structure as vpx_fdct16x16_neon() but with 32-bit lanes:
// each 8-wide column strip is kept as a left (lanes 0-3) / right (lanes 4-7)
// pair of int32x4 arrays. Output coefficients are 32-bit tran_low_t, written
// as four interleaved 4-column stripes (hence the `output += 4` steps).
void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output,
                               int stride) {
  int16x8_t temp0[16];
  int32x4_t left1[16], left2[16], left3[16], left4[16], right1[16], right2[16],
      right3[16], right4[16];

  // Left half.
  load_cross(input, stride, temp0);
  highbd_scale_input(temp0, left1, right1);
  vpx_highbd_fdct8x16_body(left1, right1);

  // right half.
  load_cross(input + 8, stride, temp0);
  highbd_scale_input(temp0, left2, right2);
  vpx_highbd_fdct8x16_body(left2, right2);

  // Transpose top left and top right quarters into one contiguous location to
  // process to the top half.

  transpose_s32_8x8_2(left1, right1, left3, right3);
  transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);
  transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4);
  transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8);

  highbd_partial_round_shift(left3, right3);
  highbd_cross_input(left3, right3, left1, right1);
  vpx_highbd_fdct8x16_body(left1, right1);

  // Transpose bottom left and bottom right quarters into one contiguous
  // location to process to the bottom half.

  highbd_partial_round_shift(left4, right4);
  highbd_cross_input(left4, right4, left2, right2);
  vpx_highbd_fdct8x16_body(left2, right2);

  transpose_s32_8x8_2(left1, right1, left3, right3);
  transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);
  transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4);
  transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8);
  store16_s32(output, left3);
  output += 4;
  store16_s32(output, right3);
  output += 4;

  store16_s32(output, left4);
  output += 4;
  store16_s32(output, right4);
}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
+ // __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h
new file mode 100644
index 0000000000..cd58675ca4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
+
+#include <arm_neon.h>
+
+#include "fdct_neon.h"
+
// Load 16 rows of 8 int16 values; consecutive rows are `stride` apart.
static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
  int i;
  for (i = 0; i < 16; ++i) {
    b[i] = vld1q_s16(a);
    a += stride;
  }
}
+
+// Store 8 16x8 values, assuming stride == 16.
// Store 8 16x8 values, assuming stride == 16.
static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
  int i;
  for (i = 0; i < 8; ++i) {
    store_s16q_to_tran_low(a, b[i]);
    a += 16;
  }
}
+
+// Load step of each pass. Add and subtract clear across the input, requiring
+// all 16 values to be loaded. For the first pass it also multiplies by 4.
+
+// To maybe reduce register usage this could be combined with the load() step to
+// get the first 4 and last 4 values, cross those, then load the middle 8 values
+// and cross them.
// Load step of each pass. Add and subtract clear across the input, requiring
// all 16 values to be loaded. For the first pass it also multiplies by 4.

// To maybe reduce register usage this could be combined with the load() step to
// get the first 4 and last 4 values, cross those, then load the middle 8 values
// and cross them.
static INLINE void scale_input(const int16x8_t *a /*[16]*/,
                               int16x8_t *b /*[16]*/) {
  // Multiply every input row by 4 (<< 2) for the first pass.
  int i;
  for (i = 0; i < 16; ++i) {
    b[i] = vshlq_n_s16(a[i], 2);
  }
}
+
// Butterfly the rows: b[i] = a[i] + a[15-i] for the first 8 outputs and
// b[15-i] = a[i] - a[15-i] for the last 8.
static INLINE void cross_input(const int16x8_t *a /*[16]*/,
                               int16x8_t *b /*[16]*/) {
  int i;
  for (i = 0; i < 8; ++i) {
    b[i] = vaddq_s16(a[i], a[15 - i]);
    b[15 - i] = vsubq_s16(a[i], a[15 - i]);
  }
}
+
// Fused load + cross_input: loads each of the 16 rows once and writes the
// mirrored sum/difference pairs directly.
static INLINE void load_cross(const int16_t *a, int stride,
                              int16x8_t *b /*[16]*/) {
  int i;
  for (i = 0; i < 8; ++i) {
    const int16x8_t top = vld1q_s16(a + i * stride);
    const int16x8_t bottom = vld1q_s16(a + (15 - i) * stride);
    b[i] = vaddq_s16(top, bottom);
    b[15 - i] = vsubq_s16(top, bottom);
  }
}
+
+// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
+// because this only adds 1, not 1 << 2.
// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
// because this only adds 1, not 1 << 2.
static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
  const int16x8_t one = vdupq_n_s16(1);
  int i;
  for (i = 0; i < 16; ++i) {
    a[i] = vshrq_n_s16(vaddq_s16(a[i], one), 2);
  }
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
// Widen each 8-lane int16 row to two int32x4 halves (low lanes -> left,
// high lanes -> right), multiplying by 4 (<< 2) on the way.
static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/,
                                      int32x4_t *left /*[16]*/,
                                      int32x4_t *right /* [16] */) {
  int i;
  for (i = 0; i < 16; ++i) {
    left[i] = vshll_n_s16(vget_low_s16(a[i]), 2);
    right[i] = vshll_n_s16(vget_high_s16(a[i]), 2);
  }
}
+
// High-bitdepth cross_input: mirrored sum/difference butterfly applied to the
// left and right int32x4 halves of each row.
static INLINE void highbd_cross_input(const int32x4_t *a_left /*[16]*/,
                                      int32x4_t *a_right /*[16]*/,
                                      int32x4_t *b_left /*[16]*/,
                                      int32x4_t *b_right /*[16]*/) {
  int i;
  for (i = 0; i < 8; ++i) {
    b_left[i] = vaddq_s32(a_left[i], a_left[15 - i]);
    b_right[i] = vaddq_s32(a_right[i], a_right[15 - i]);
    b_left[15 - i] = vsubq_s32(a_left[i], a_left[15 - i]);
    b_right[15 - i] = vsubq_s32(a_right[i], a_right[15 - i]);
  }
}
+
// High-bitdepth quarter round for the second pass: (x + 1) >> 2 on every
// vector. Not vrshrq because the rounding constant is 1, not 1 << 2.
static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/,
                                              int32x4_t *right /* [16] */) {
  const int32x4_t one = vdupq_n_s32(1);
  int i;
  for (i = 0; i < 16; ++i) {
    left[i] = vshrq_n_s32(vaddq_s32(left[i], one), 2);
    right[i] = vshrq_n_s32(vaddq_s32(right[i], one), 2);
  }
}
+
+// Store 16 32x4 vectors, assuming stride == 16.
+static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) {
+ vst1q_s32(a, b[0]);
+ a += 16;
+ vst1q_s32(a, b[1]);
+ a += 16;
+ vst1q_s32(a, b[2]);
+ a += 16;
+ vst1q_s32(a, b[3]);
+ a += 16;
+ vst1q_s32(a, b[4]);
+ a += 16;
+ vst1q_s32(a, b[5]);
+ a += 16;
+ vst1q_s32(a, b[6]);
+ a += 16;
+ vst1q_s32(a, b[7]);
+ a += 16;
+ vst1q_s32(a, b[8]);
+ a += 16;
+ vst1q_s32(a, b[9]);
+ a += 16;
+ vst1q_s32(a, b[10]);
+ a += 16;
+ vst1q_s32(a, b[11]);
+ a += 16;
+ vst1q_s32(a, b[12]);
+ a += 16;
+ vst1q_s32(a, b[13]);
+ a += 16;
+ vst1q_s32(a, b[14]);
+ a += 16;
+ vst1q_s32(a, b[15]);
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c
new file mode 100644
index 0000000000..a91730ce8b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct32x32_neon.h"
+
+// Most gcc 4.9 distributions outside of Android do not generate correct code
+// for this function.
+#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \
+ __GNUC__ == 4 && __GNUC_MINOR__ <= 9
+
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
+ vpx_fdct32x32_c(input, output, stride);
+}
+
+void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vpx_fdct32x32_rd_c(input, output, stride);
+}
+
+#else
+
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
+ int16x8_t temp0[32];
+ int16x8_t temp1[32];
+ int16x8_t temp2[32];
+ int16x8_t temp3[32];
+ int16x8_t temp4[32];
+ int16x8_t temp5[32];
+
+ // Process in 8x32 columns.
+ load_cross(input, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp1);
+
+ load_cross(input + 8, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp2);
+
+ load_cross(input + 16, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp3);
+
+ load_cross(input + 24, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp4);
+
+ // Generate the top row by munging the first set of 8 from each one together.
+ transpose_s16_8x8q(&temp1[0], &temp0[0]);
+ transpose_s16_8x8q(&temp2[0], &temp0[8]);
+ transpose_s16_8x8q(&temp3[0], &temp0[16]);
+ transpose_s16_8x8q(&temp4[0], &temp0[24]);
+
+ dct_body_second_pass(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output, temp5);
+
+ // Second row of 8x32.
+ transpose_s16_8x8q(&temp1[8], &temp0[0]);
+ transpose_s16_8x8q(&temp2[8], &temp0[8]);
+ transpose_s16_8x8q(&temp3[8], &temp0[16]);
+ transpose_s16_8x8q(&temp4[8], &temp0[24]);
+
+ dct_body_second_pass(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 8 * 32, temp5);
+
+ // Third row of 8x32
+ transpose_s16_8x8q(&temp1[16], &temp0[0]);
+ transpose_s16_8x8q(&temp2[16], &temp0[8]);
+ transpose_s16_8x8q(&temp3[16], &temp0[16]);
+ transpose_s16_8x8q(&temp4[16], &temp0[24]);
+
+ dct_body_second_pass(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 16 * 32, temp5);
+
+ // Final row of 8x32.
+ transpose_s16_8x8q(&temp1[24], &temp0[0]);
+ transpose_s16_8x8q(&temp2[24], &temp0[8]);
+ transpose_s16_8x8q(&temp3[24], &temp0[16]);
+ transpose_s16_8x8q(&temp4[24], &temp0[24]);
+
+ dct_body_second_pass(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 24 * 32, temp5);
+}
+
+void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[32];
+ int16x8_t temp1[32];
+ int16x8_t temp2[32];
+ int16x8_t temp3[32];
+ int16x8_t temp4[32];
+ int16x8_t temp5[32];
+
+ // Process in 8x32 columns.
+ load_cross(input, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp1);
+
+ load_cross(input + 8, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp2);
+
+ load_cross(input + 16, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp3);
+
+ load_cross(input + 24, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp4);
+
+ // Generate the top row by munging the first set of 8 from each one together.
+ transpose_s16_8x8q(&temp1[0], &temp0[0]);
+ transpose_s16_8x8q(&temp2[0], &temp0[8]);
+ transpose_s16_8x8q(&temp3[0], &temp0[16]);
+ transpose_s16_8x8q(&temp4[0], &temp0[24]);
+
+ dct_body_second_pass_rd(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output, temp5);
+
+ // Second row of 8x32.
+ transpose_s16_8x8q(&temp1[8], &temp0[0]);
+ transpose_s16_8x8q(&temp2[8], &temp0[8]);
+ transpose_s16_8x8q(&temp3[8], &temp0[16]);
+ transpose_s16_8x8q(&temp4[8], &temp0[24]);
+
+ dct_body_second_pass_rd(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 8 * 32, temp5);
+
+ // Third row of 8x32
+ transpose_s16_8x8q(&temp1[16], &temp0[0]);
+ transpose_s16_8x8q(&temp2[16], &temp0[8]);
+ transpose_s16_8x8q(&temp3[16], &temp0[16]);
+ transpose_s16_8x8q(&temp4[16], &temp0[24]);
+
+ dct_body_second_pass_rd(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 16 * 32, temp5);
+
+ // Final row of 8x32.
+ transpose_s16_8x8q(&temp1[24], &temp0[0]);
+ transpose_s16_8x8q(&temp2[24], &temp0[8]);
+ transpose_s16_8x8q(&temp3[24], &temp0[16]);
+ transpose_s16_8x8q(&temp4[24], &temp0[24]);
+
+ dct_body_second_pass_rd(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 24 * 32, temp5);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[32];
+ int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32],
+ right3[32], right4[32];
+ int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32],
+ left8[32], right8[32];
+ int32x4_t temp1[32], temp2[32];
+
+ // Process in 8x32 columns.
+ load_cross(input, stride, temp0);
+ highbd_scale_input(temp0, left1, right1);
+ highbd_dct8x32_body_first_pass(left1, right1);
+ highbd_partial_sub_round_shift(left1, right1);
+
+ load_cross(input + 8, stride, temp0);
+ highbd_scale_input(temp0, left2, right2);
+ highbd_dct8x32_body_first_pass(left2, right2);
+ highbd_partial_sub_round_shift(left2, right2);
+
+ load_cross(input + 16, stride, temp0);
+ highbd_scale_input(temp0, left3, right3);
+ highbd_dct8x32_body_first_pass(left3, right3);
+ highbd_partial_sub_round_shift(left3, right3);
+
+ load_cross(input + 24, stride, temp0);
+ highbd_scale_input(temp0, left4, right4);
+ highbd_dct8x32_body_first_pass(left4, right4);
+ highbd_partial_sub_round_shift(left4, right4);
+
+ // Generate the top row by munging the first set of 8 from each one together.
+ transpose_s32_8x8_2(left1, right1, temp1, temp2);
+ transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left5, right5);
+ highbd_dct8x32_body_second_pass(left5, right5);
+ highbd_partial_add_round_shift(left5, right5);
+
+ // Second row of 8x32.
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left6, right6);
+ highbd_dct8x32_body_second_pass(left6, right6);
+ highbd_partial_add_round_shift(left6, right6);
+
+ // Third row of 8x32
+ transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left7, right7);
+ highbd_dct8x32_body_second_pass(left7, right7);
+ highbd_partial_add_round_shift(left7, right7);
+
+ // Final row of 8x32.
+ transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left8, right8);
+ highbd_dct8x32_body_second_pass(left8, right8);
+ highbd_partial_add_round_shift(left8, right8);
+
+ // Final transpose
+ transpose_s32_8x8_2(left5, right5, left1, right1);
+ transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2);
+ transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3);
+ transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4);
+ transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8);
+ transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8);
+ transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8);
+ transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16);
+ transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16);
+ transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16);
+ transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16);
+ transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24);
+ transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24);
+ transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24);
+ transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24);
+
+ store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4,
+ right4);
+}
+
+void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[32];
+ int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32],
+ right3[32], right4[32];
+ int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32],
+ left8[32], right8[32];
+ int32x4_t temp1[32], temp2[32];
+
+ // Process in 8x32 columns.
+ load_cross(input, stride, temp0);
+ highbd_scale_input(temp0, left1, right1);
+ highbd_dct8x32_body_first_pass(left1, right1);
+ highbd_partial_sub_round_shift(left1, right1);
+
+ load_cross(input + 8, stride, temp0);
+ highbd_scale_input(temp0, left2, right2);
+ highbd_dct8x32_body_first_pass(left2, right2);
+ highbd_partial_sub_round_shift(left2, right2);
+
+ load_cross(input + 16, stride, temp0);
+ highbd_scale_input(temp0, left3, right3);
+ highbd_dct8x32_body_first_pass(left3, right3);
+ highbd_partial_sub_round_shift(left3, right3);
+
+ load_cross(input + 24, stride, temp0);
+ highbd_scale_input(temp0, left4, right4);
+ highbd_dct8x32_body_first_pass(left4, right4);
+ highbd_partial_sub_round_shift(left4, right4);
+
+ // Generate the top row by munging the first set of 8 from each one together.
+ transpose_s32_8x8_2(left1, right1, temp1, temp2);
+ transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left5, right5);
+ highbd_dct8x32_body_second_pass_rd(left5, right5);
+
+ // Second row of 8x32.
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left6, right6);
+ highbd_dct8x32_body_second_pass_rd(left6, right6);
+
+ // Third row of 8x32
+ transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left7, right7);
+ highbd_dct8x32_body_second_pass_rd(left7, right7);
+
+ // Final row of 8x32.
+ transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left8, right8);
+ highbd_dct8x32_body_second_pass_rd(left8, right8);
+
+ // Final transpose
+ transpose_s32_8x8_2(left5, right5, left1, right1);
+ transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2);
+ transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3);
+ transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4);
+ transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8);
+ transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8);
+ transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8);
+ transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16);
+ transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16);
+ transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16);
+ transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16);
+ transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24);
+ transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24);
+ transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24);
+ transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24);
+
+ store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4,
+ right4);
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
+ // __GNUC__ == 4 && __GNUC_MINOR__ <= 9
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h
new file mode 100644
index 0000000000..3b9e64c6df
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h
@@ -0,0 +1,2919 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+
+// Load & cross the first 8 and last 8, then the middle
+static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) {
+ b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+ b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+ b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+ b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+ b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+ b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+ b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+ b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+
+ b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+ b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+ b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+ b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+ b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+ b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+ b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+ b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+
+ b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+ b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+ b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+ b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+ b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+ b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+ b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+ b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+
+ b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+ b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+ b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+ b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+ b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+ b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+ b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+ b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+}
+
+#define STORE_S16(src, index, dest) \
+ do { \
+ store_s16q_to_tran_low(dest, src[index]); \
+ dest += 8; \
+ } while (0)
+
+// Store 32 16x8 values, assuming stride == 32.
+// Slight twist: store horizontally in blocks of 8.
+static INLINE void store(tran_low_t *a, const int16x8_t *b) {
+ STORE_S16(b, 0, a);
+ STORE_S16(b, 8, a);
+ STORE_S16(b, 16, a);
+ STORE_S16(b, 24, a);
+ STORE_S16(b, 1, a);
+ STORE_S16(b, 9, a);
+ STORE_S16(b, 17, a);
+ STORE_S16(b, 25, a);
+ STORE_S16(b, 2, a);
+ STORE_S16(b, 10, a);
+ STORE_S16(b, 18, a);
+ STORE_S16(b, 26, a);
+ STORE_S16(b, 3, a);
+ STORE_S16(b, 11, a);
+ STORE_S16(b, 19, a);
+ STORE_S16(b, 27, a);
+ STORE_S16(b, 4, a);
+ STORE_S16(b, 12, a);
+ STORE_S16(b, 20, a);
+ STORE_S16(b, 28, a);
+ STORE_S16(b, 5, a);
+ STORE_S16(b, 13, a);
+ STORE_S16(b, 21, a);
+ STORE_S16(b, 29, a);
+ STORE_S16(b, 6, a);
+ STORE_S16(b, 14, a);
+ STORE_S16(b, 22, a);
+ STORE_S16(b, 30, a);
+ STORE_S16(b, 7, a);
+ STORE_S16(b, 15, a);
+ STORE_S16(b, 23, a);
+ STORE_S16(b, 31, a);
+}
+
+#undef STORE_S16
+
+static INLINE void scale_input(const int16x8_t *in /*32*/,
+ int16x8_t *out /*32*/) {
+ out[0] = vshlq_n_s16(in[0], 2);
+ out[1] = vshlq_n_s16(in[1], 2);
+ out[2] = vshlq_n_s16(in[2], 2);
+ out[3] = vshlq_n_s16(in[3], 2);
+ out[4] = vshlq_n_s16(in[4], 2);
+ out[5] = vshlq_n_s16(in[5], 2);
+ out[6] = vshlq_n_s16(in[6], 2);
+ out[7] = vshlq_n_s16(in[7], 2);
+
+ out[8] = vshlq_n_s16(in[8], 2);
+ out[9] = vshlq_n_s16(in[9], 2);
+ out[10] = vshlq_n_s16(in[10], 2);
+ out[11] = vshlq_n_s16(in[11], 2);
+ out[12] = vshlq_n_s16(in[12], 2);
+ out[13] = vshlq_n_s16(in[13], 2);
+ out[14] = vshlq_n_s16(in[14], 2);
+ out[15] = vshlq_n_s16(in[15], 2);
+
+ out[16] = vshlq_n_s16(in[16], 2);
+ out[17] = vshlq_n_s16(in[17], 2);
+ out[18] = vshlq_n_s16(in[18], 2);
+ out[19] = vshlq_n_s16(in[19], 2);
+ out[20] = vshlq_n_s16(in[20], 2);
+ out[21] = vshlq_n_s16(in[21], 2);
+ out[22] = vshlq_n_s16(in[22], 2);
+ out[23] = vshlq_n_s16(in[23], 2);
+
+ out[24] = vshlq_n_s16(in[24], 2);
+ out[25] = vshlq_n_s16(in[25], 2);
+ out[26] = vshlq_n_s16(in[26], 2);
+ out[27] = vshlq_n_s16(in[27], 2);
+ out[28] = vshlq_n_s16(in[28], 2);
+ out[29] = vshlq_n_s16(in[29], 2);
+ out[30] = vshlq_n_s16(in[30], 2);
+ out[31] = vshlq_n_s16(in[31], 2);
+}
+
+static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // Mini cross. X the first 16 values and the middle 8 of the second half.
+ a[0] = vaddq_s16(in[0], in[15]);
+ a[1] = vaddq_s16(in[1], in[14]);
+ a[2] = vaddq_s16(in[2], in[13]);
+ a[3] = vaddq_s16(in[3], in[12]);
+ a[4] = vaddq_s16(in[4], in[11]);
+ a[5] = vaddq_s16(in[5], in[10]);
+ a[6] = vaddq_s16(in[6], in[9]);
+ a[7] = vaddq_s16(in[7], in[8]);
+
+ a[8] = vsubq_s16(in[7], in[8]);
+ a[9] = vsubq_s16(in[6], in[9]);
+ a[10] = vsubq_s16(in[5], in[10]);
+ a[11] = vsubq_s16(in[4], in[11]);
+ a[12] = vsubq_s16(in[3], in[12]);
+ a[13] = vsubq_s16(in[2], in[13]);
+ a[14] = vsubq_s16(in[1], in[14]);
+ a[15] = vsubq_s16(in[0], in[15]);
+
+ a[16] = in[16];
+ a[17] = in[17];
+ a[18] = in[18];
+ a[19] = in[19];
+
+ butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27],
+ &a[20]);
+ butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26],
+ &a[21]);
+ butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25],
+ &a[22]);
+ butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24],
+ &a[23]);
+
+ a[28] = in[28];
+ a[29] = in[29];
+ a[30] = in[30];
+ a[31] = in[31];
+
+ // Stage 3.
+ b[0] = vaddq_s16(a[0], a[7]);
+ b[1] = vaddq_s16(a[1], a[6]);
+ b[2] = vaddq_s16(a[2], a[5]);
+ b[3] = vaddq_s16(a[3], a[4]);
+
+ b[4] = vsubq_s16(a[3], a[4]);
+ b[5] = vsubq_s16(a[2], a[5]);
+ b[6] = vsubq_s16(a[1], a[6]);
+ b[7] = vsubq_s16(a[0], a[7]);
+
+ b[8] = a[8];
+ b[9] = a[9];
+
+ butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]);
+ butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]);
+
+ b[14] = a[14];
+ b[15] = a[15];
+
+ b[16] = vaddq_s16(in[16], a[23]);
+ b[17] = vaddq_s16(in[17], a[22]);
+ b[18] = vaddq_s16(in[18], a[21]);
+ b[19] = vaddq_s16(in[19], a[20]);
+
+ b[20] = vsubq_s16(in[19], a[20]);
+ b[21] = vsubq_s16(in[18], a[21]);
+ b[22] = vsubq_s16(in[17], a[22]);
+ b[23] = vsubq_s16(in[16], a[23]);
+
+ b[24] = vsubq_s16(in[31], a[24]);
+ b[25] = vsubq_s16(in[30], a[25]);
+ b[26] = vsubq_s16(in[29], a[26]);
+ b[27] = vsubq_s16(in[28], a[27]);
+
+ b[28] = vaddq_s16(in[28], a[27]);
+ b[29] = vaddq_s16(in[29], a[26]);
+ b[30] = vaddq_s16(in[30], a[25]);
+ b[31] = vaddq_s16(in[31], a[24]);
+
+ // Stage 4.
+ a[0] = vaddq_s16(b[0], b[3]);
+ a[1] = vaddq_s16(b[1], b[2]);
+ a[2] = vsubq_s16(b[1], b[2]);
+ a[3] = vsubq_s16(b[0], b[3]);
+
+ a[4] = b[4];
+
+ butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]);
+
+ a[7] = b[7];
+
+ a[8] = vaddq_s16(b[8], b[11]);
+ a[9] = vaddq_s16(b[9], b[10]);
+ a[10] = vsubq_s16(b[9], b[10]);
+ a[11] = vsubq_s16(b[8], b[11]);
+ a[12] = vsubq_s16(b[15], b[12]);
+ a[13] = vsubq_s16(b[14], b[13]);
+ a[14] = vaddq_s16(b[14], b[13]);
+ a[15] = vaddq_s16(b[15], b[12]);
+
+ a[16] = b[16];
+ a[17] = b[17];
+
+ butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]);
+ butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]);
+ butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]);
+ butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]);
+
+ a[22] = b[22];
+ a[23] = b[23];
+ a[24] = b[24];
+ a[25] = b[25];
+
+ a[30] = b[30];
+ a[31] = b[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]);
+ butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]);
+
+ b[4] = vaddq_s16(a[4], a[5]);
+ b[5] = vsubq_s16(a[4], a[5]);
+ b[6] = vsubq_s16(a[7], a[6]);
+ b[7] = vaddq_s16(a[7], a[6]);
+
+ b[8] = a[8];
+
+ butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]);
+ butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]);
+
+ b[11] = a[11];
+ b[12] = a[12];
+
+ b[15] = a[15];
+
+ b[16] = vaddq_s16(a[19], a[16]);
+ b[17] = vaddq_s16(a[18], a[17]);
+ b[18] = vsubq_s16(a[17], a[18]);
+ b[19] = vsubq_s16(a[16], a[19]);
+ b[20] = vsubq_s16(a[23], a[20]);
+ b[21] = vsubq_s16(a[22], a[21]);
+ b[22] = vaddq_s16(a[21], a[22]);
+ b[23] = vaddq_s16(a[20], a[23]);
+ b[24] = vaddq_s16(a[27], a[24]);
+ b[25] = vaddq_s16(a[26], a[25]);
+ b[26] = vsubq_s16(a[25], a[26]);
+ b[27] = vsubq_s16(a[24], a[27]);
+ b[28] = vsubq_s16(a[31], a[28]);
+ b[29] = vsubq_s16(a[30], a[29]);
+ b[30] = vaddq_s16(a[29], a[30]);
+ b[31] = vaddq_s16(a[28], a[31]);
+
+ // Stage 6.
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+
+ butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]);
+ butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]);
+
+ a[8] = vaddq_s16(b[8], b[9]);
+ a[9] = vsubq_s16(b[8], b[9]);
+ a[10] = vsubq_s16(b[11], b[10]);
+ a[11] = vaddq_s16(b[11], b[10]);
+ a[12] = vaddq_s16(b[12], b[13]);
+ a[13] = vsubq_s16(b[12], b[13]);
+ a[14] = vsubq_s16(b[15], b[14]);
+ a[15] = vaddq_s16(b[15], b[14]);
+
+ a[16] = b[16];
+ a[19] = b[19];
+ a[20] = b[20];
+ a[23] = b[23];
+ a[24] = b[24];
+ a[27] = b[27];
+ a[28] = b[28];
+ a[31] = b[31];
+
+ butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]);
+ butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]);
+
+ butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]);
+ butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]);
+
+ // Stage 7.
+ b[0] = a[0];
+ b[1] = a[1];
+ b[2] = a[2];
+ b[3] = a[3];
+ b[4] = a[4];
+ b[5] = a[5];
+ b[6] = a[6];
+ b[7] = a[7];
+
+ butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]);
+ butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]);
+ butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]);
+ butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]);
+
+ b[16] = vaddq_s16(a[16], a[17]);
+ b[17] = vsubq_s16(a[16], a[17]);
+ b[18] = vsubq_s16(a[19], a[18]);
+ b[19] = vaddq_s16(a[19], a[18]);
+ b[20] = vaddq_s16(a[20], a[21]);
+ b[21] = vsubq_s16(a[20], a[21]);
+ b[22] = vsubq_s16(a[23], a[22]);
+ b[23] = vaddq_s16(a[23], a[22]);
+ b[24] = vaddq_s16(a[24], a[25]);
+ b[25] = vsubq_s16(a[24], a[25]);
+ b[26] = vsubq_s16(a[27], a[26]);
+ b[27] = vaddq_s16(a[27], a[26]);
+ b[28] = vaddq_s16(a[28], a[29]);
+ b[29] = vsubq_s16(a[28], a[29]);
+ b[30] = vsubq_s16(a[31], a[30]);
+ b[31] = vaddq_s16(a[31], a[30]);
+
+ // Final stage.
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ out[0] = sub_round_shift_s16(b[0]);
+ out[16] = sub_round_shift_s16(b[1]);
+ out[8] = sub_round_shift_s16(b[2]);
+ out[24] = sub_round_shift_s16(b[3]);
+ out[4] = sub_round_shift_s16(b[4]);
+ out[20] = sub_round_shift_s16(b[5]);
+ out[12] = sub_round_shift_s16(b[6]);
+ out[28] = sub_round_shift_s16(b[7]);
+ out[2] = sub_round_shift_s16(b[8]);
+ out[18] = sub_round_shift_s16(b[9]);
+ out[10] = sub_round_shift_s16(b[10]);
+ out[26] = sub_round_shift_s16(b[11]);
+ out[6] = sub_round_shift_s16(b[12]);
+ out[22] = sub_round_shift_s16(b[13]);
+ out[14] = sub_round_shift_s16(b[14]);
+ out[30] = sub_round_shift_s16(b[15]);
+
+ butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]);
+ out[1] = sub_round_shift_s16(a[1]);
+ out[31] = sub_round_shift_s16(a[31]);
+
+ butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]);
+ out[17] = sub_round_shift_s16(a[17]);
+ out[15] = sub_round_shift_s16(a[15]);
+
+ butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]);
+ out[9] = sub_round_shift_s16(a[9]);
+ out[23] = sub_round_shift_s16(a[23]);
+
+ butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]);
+ out[25] = sub_round_shift_s16(a[25]);
+ out[7] = sub_round_shift_s16(a[7]);
+
+ butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]);
+ out[5] = sub_round_shift_s16(a[5]);
+ out[27] = sub_round_shift_s16(a[27]);
+
+ butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]);
+ out[21] = sub_round_shift_s16(a[21]);
+ out[11] = sub_round_shift_s16(a[11]);
+
+ butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]);
+ out[13] = sub_round_shift_s16(a[13]);
+ out[19] = sub_round_shift_s16(a[19]);
+
+ butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]);
+ out[29] = sub_round_shift_s16(a[29]);
+ out[3] = sub_round_shift_s16(a[3]);
+}
+
+// File-local helper macros for running the second pass in 32-bit precision.
+// A bare array name (e.g. |c|) refers to a pair of int32x4_t arrays,
+// name##_lo and name##_hi, holding the low and high four lanes of one
+// 8-lane column group.
+
+// Copy element |element| of the src_lo/src_hi pair into dst_lo/dst_hi.
+#define PASS_THROUGH(src, dst, element) \
+  do { \
+    dst##_lo[element] = src##_lo[element]; \
+    dst##_hi[element] = src##_hi[element]; \
+  } while (0)
+
+// b[b_index] = widen(a[left_index]) + widen(a[right_index]) (int16 -> int32).
+#define ADD_S16_S32(a, left_index, right_index, b, b_index) \
+  do { \
+    b##_lo[b_index] = \
+        vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+    b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \
+                                vget_high_s16(a[right_index])); \
+  } while (0)
+
+// b[b_index] = widen(a[left_index]) - widen(a[right_index]) (int16 -> int32).
+#define SUB_S16_S32(a, left_index, right_index, b, b_index) \
+  do { \
+    b##_lo[b_index] = \
+        vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+    b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \
+                                vget_high_s16(a[right_index])); \
+  } while (0)
+
+// c[c_index] = a[a_index] (already int32) + widen(b[b_index]) (int16).
+#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \
+  do { \
+    c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \
+    c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
+  } while (0)
+
+// c[c_index] = widen(a[a_index]) (int16) - b[b_index] (already int32).
+// temp[temp_index] holds the widened minuend as scratch.
+#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
+  do { \
+    temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \
+    temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \
+    c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \
+    c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \
+  } while (0)
+
+// b[b_index] = a[left_index] + a[right_index], all in int32 pairs.
+#define ADD_S32(a, left_index, right_index, b, b_index) \
+  do { \
+    b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
+    b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
+  } while (0)
+
+// b[b_index] = a[left_index] - a[right_index], all in int32 pairs.
+#define SUB_S32(a, left_index, right_index, b, b_index) \
+  do { \
+    b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
+    b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
+  } while (0)
+
+// Single-coefficient butterfly on int16 inputs producing int32 pairs; wraps
+// butterfly_one_coeff_s16_s32, writing into b[add_index] and b[sub_index].
+#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \
+                              add_index, sub_index) \
+  do { \
+    butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
+                                &b##_lo[add_index], &b##_hi[add_index], \
+                                &b##_lo[sub_index], &b##_hi[sub_index]); \
+  } while (0)
+
+// Single-coefficient butterfly entirely in int32 pairs; wraps the "fast"
+// butterfly_one_coeff_s32_fast variant.
+#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
+                          sub_index) \
+  do { \
+    butterfly_one_coeff_s32_fast( \
+        a##_lo[left_index], a##_hi[left_index], a##_lo[right_index], \
+        a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \
+        &b##_lo[sub_index], &b##_hi[sub_index]); \
+  } while (0)
+
+// Two-coefficient butterfly in int32 pairs; wraps butterfly_two_coeff_s32.
+#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \
+                          right_constant, b, add_index, sub_index) \
+  do { \
+    butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
+                            a##_lo[right_index], a##_hi[right_index], \
+                            left_constant, right_constant, &b##_lo[add_index], \
+                            &b##_hi[add_index], &b##_lo[sub_index], \
+                            &b##_hi[sub_index]); \
+  } while (0)
+
+// Second pass of the 32-point forward DCT for one group of 8 columns.
+// |in| holds the 32 vectors produced by the first pass; the 32 results are
+// written to |out| with the final-stage rounding folded in so the function
+// can return int16x8_t values. From stage 3 onward the math is carried in
+// 32 bits (the c_lo/c_hi and d_lo/d_hi array pairs) because extreme inputs
+// overflow int16_t - see the comment at stage 3.
+static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
+  int16x8_t a[32];
+  int16x8_t b[32];
+  int32x4_t c_lo[32];
+  int32x4_t c_hi[32];
+  int32x4_t d_lo[32];
+  int32x4_t d_hi[32];
+
+  // Stage 1. Done as part of the load for the first pass.
+  a[0] = vaddq_s16(in[0], in[31]);
+  a[1] = vaddq_s16(in[1], in[30]);
+  a[2] = vaddq_s16(in[2], in[29]);
+  a[3] = vaddq_s16(in[3], in[28]);
+  a[4] = vaddq_s16(in[4], in[27]);
+  a[5] = vaddq_s16(in[5], in[26]);
+  a[6] = vaddq_s16(in[6], in[25]);
+  a[7] = vaddq_s16(in[7], in[24]);
+  a[8] = vaddq_s16(in[8], in[23]);
+  a[9] = vaddq_s16(in[9], in[22]);
+  a[10] = vaddq_s16(in[10], in[21]);
+  a[11] = vaddq_s16(in[11], in[20]);
+  a[12] = vaddq_s16(in[12], in[19]);
+  a[13] = vaddq_s16(in[13], in[18]);
+  a[14] = vaddq_s16(in[14], in[17]);
+  a[15] = vaddq_s16(in[15], in[16]);
+  a[16] = vsubq_s16(in[15], in[16]);
+  a[17] = vsubq_s16(in[14], in[17]);
+  a[18] = vsubq_s16(in[13], in[18]);
+  a[19] = vsubq_s16(in[12], in[19]);
+  a[20] = vsubq_s16(in[11], in[20]);
+  a[21] = vsubq_s16(in[10], in[21]);
+  a[22] = vsubq_s16(in[9], in[22]);
+  a[23] = vsubq_s16(in[8], in[23]);
+  a[24] = vsubq_s16(in[7], in[24]);
+  a[25] = vsubq_s16(in[6], in[25]);
+  a[26] = vsubq_s16(in[5], in[26]);
+  a[27] = vsubq_s16(in[4], in[27]);
+  a[28] = vsubq_s16(in[3], in[28]);
+  a[29] = vsubq_s16(in[2], in[29]);
+  a[30] = vsubq_s16(in[1], in[30]);
+  a[31] = vsubq_s16(in[0], in[31]);
+
+  // Stage 2.
+  b[0] = vaddq_s16(a[0], a[15]);
+  b[1] = vaddq_s16(a[1], a[14]);
+  b[2] = vaddq_s16(a[2], a[13]);
+  b[3] = vaddq_s16(a[3], a[12]);
+  b[4] = vaddq_s16(a[4], a[11]);
+  b[5] = vaddq_s16(a[5], a[10]);
+  b[6] = vaddq_s16(a[6], a[9]);
+  b[7] = vaddq_s16(a[7], a[8]);
+
+  b[8] = vsubq_s16(a[7], a[8]);
+  b[9] = vsubq_s16(a[6], a[9]);
+  b[10] = vsubq_s16(a[5], a[10]);
+  b[11] = vsubq_s16(a[4], a[11]);
+  b[12] = vsubq_s16(a[3], a[12]);
+  b[13] = vsubq_s16(a[2], a[13]);
+  b[14] = vsubq_s16(a[1], a[14]);
+  b[15] = vsubq_s16(a[0], a[15]);
+
+  b[16] = a[16];
+  b[17] = a[17];
+  b[18] = a[18];
+  b[19] = a[19];
+
+  butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+  butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+  butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+  butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+
+  b[28] = a[28];
+  b[29] = a[29];
+  b[30] = a[30];
+  b[31] = a[31];
+
+  // Stage 3. With extreme values for input this calculation rolls over int16_t.
+  // The sources for b[0] get added multiple times and, through testing, have
+  // been shown to overflow starting here.
+  ADD_S16_S32(b, 0, 7, c, 0);
+  ADD_S16_S32(b, 1, 6, c, 1);
+  ADD_S16_S32(b, 2, 5, c, 2);
+  ADD_S16_S32(b, 3, 4, c, 3);
+  SUB_S16_S32(b, 3, 4, c, 4);
+  SUB_S16_S32(b, 2, 5, c, 5);
+  SUB_S16_S32(b, 1, 6, c, 6);
+  SUB_S16_S32(b, 0, 7, c, 7);
+
+  // b[8], b[9], b[14] and b[15] stay in 16 bits here; they are widened when
+  // combined in stage 4 via the ADDW/SUBW macros below.
+  a[8] = b[8];
+  a[9] = b[9];
+
+  BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
+  BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
+
+  a[14] = b[14];
+  a[15] = b[15];
+
+  ADD_S16_S32(b, 16, 23, c, 16);
+  ADD_S16_S32(b, 17, 22, c, 17);
+  ADD_S16_S32(b, 18, 21, c, 18);
+  ADD_S16_S32(b, 19, 20, c, 19);
+  SUB_S16_S32(b, 19, 20, c, 20);
+  SUB_S16_S32(b, 18, 21, c, 21);
+  SUB_S16_S32(b, 17, 22, c, 22);
+  SUB_S16_S32(b, 16, 23, c, 23);
+  SUB_S16_S32(b, 31, 24, c, 24);
+  SUB_S16_S32(b, 30, 25, c, 25);
+  SUB_S16_S32(b, 29, 26, c, 26);
+  SUB_S16_S32(b, 28, 27, c, 27);
+  ADD_S16_S32(b, 28, 27, c, 28);
+  ADD_S16_S32(b, 29, 26, c, 29);
+  ADD_S16_S32(b, 30, 25, c, 30);
+  ADD_S16_S32(b, 31, 24, c, 31);
+
+  // Stage 4.
+  ADD_S32(c, 0, 3, d, 0);
+  ADD_S32(c, 1, 2, d, 1);
+  SUB_S32(c, 1, 2, d, 2);
+  SUB_S32(c, 0, 3, d, 3);
+
+  PASS_THROUGH(c, d, 4);
+
+  BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
+
+  PASS_THROUGH(c, d, 7);
+
+  ADDW_S16_S32(c, 11, a, 8, d, 8);
+  ADDW_S16_S32(c, 10, a, 9, d, 9);
+  SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
+  SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
+  SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
+  SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
+  ADDW_S16_S32(c, 13, b, 14, d, 14);
+  ADDW_S16_S32(c, 12, b, 15, d, 15);
+
+  PASS_THROUGH(c, d, 16);
+  PASS_THROUGH(c, d, 17);
+
+  BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18);
+  BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19);
+  BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20);
+  BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21);
+
+  PASS_THROUGH(c, d, 22);
+  PASS_THROUGH(c, d, 23);
+  PASS_THROUGH(c, d, 24);
+  PASS_THROUGH(c, d, 25);
+
+  PASS_THROUGH(c, d, 30);
+  PASS_THROUGH(c, d, 31);
+
+  // Stage 5.
+  BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
+  BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3);
+
+  ADD_S32(d, 4, 5, c, 4);
+  SUB_S32(d, 4, 5, c, 5);
+  SUB_S32(d, 7, 6, c, 6);
+  ADD_S32(d, 7, 6, c, 7);
+
+  PASS_THROUGH(d, c, 8);
+
+  BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9);
+  BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10);
+
+  PASS_THROUGH(d, c, 11);
+  PASS_THROUGH(d, c, 12);
+  PASS_THROUGH(d, c, 15);
+
+  ADD_S32(d, 16, 19, c, 16);
+  ADD_S32(d, 17, 18, c, 17);
+  SUB_S32(d, 17, 18, c, 18);
+  SUB_S32(d, 16, 19, c, 19);
+  SUB_S32(d, 23, 20, c, 20);
+  SUB_S32(d, 22, 21, c, 21);
+  ADD_S32(d, 22, 21, c, 22);
+  ADD_S32(d, 23, 20, c, 23);
+  ADD_S32(d, 24, 27, c, 24);
+  ADD_S32(d, 25, 26, c, 25);
+  SUB_S32(d, 25, 26, c, 26);
+  SUB_S32(d, 24, 27, c, 27);
+  SUB_S32(d, 31, 28, c, 28);
+  SUB_S32(d, 30, 29, c, 29);
+  ADD_S32(d, 30, 29, c, 30);
+  ADD_S32(d, 31, 28, c, 31);
+
+  // Stage 6.
+  PASS_THROUGH(c, d, 0);
+  PASS_THROUGH(c, d, 1);
+  PASS_THROUGH(c, d, 2);
+  PASS_THROUGH(c, d, 3);
+
+  BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7);
+  BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6);
+
+  ADD_S32(c, 8, 9, d, 8);
+  SUB_S32(c, 8, 9, d, 9);
+  SUB_S32(c, 11, 10, d, 10);
+  ADD_S32(c, 11, 10, d, 11);
+  ADD_S32(c, 12, 13, d, 12);
+  SUB_S32(c, 12, 13, d, 13);
+  SUB_S32(c, 15, 14, d, 14);
+  ADD_S32(c, 15, 14, d, 15);
+
+  PASS_THROUGH(c, d, 16);
+  PASS_THROUGH(c, d, 19);
+  PASS_THROUGH(c, d, 20);
+  PASS_THROUGH(c, d, 23);
+  PASS_THROUGH(c, d, 24);
+  PASS_THROUGH(c, d, 27);
+  PASS_THROUGH(c, d, 28);
+  PASS_THROUGH(c, d, 31);
+
+  BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17);
+  BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18);
+  BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21);
+  BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22);
+
+  // Stage 7.
+  PASS_THROUGH(d, c, 0);
+  PASS_THROUGH(d, c, 1);
+  PASS_THROUGH(d, c, 2);
+  PASS_THROUGH(d, c, 3);
+  PASS_THROUGH(d, c, 4);
+  PASS_THROUGH(d, c, 5);
+  PASS_THROUGH(d, c, 6);
+  PASS_THROUGH(d, c, 7);
+
+  BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15);
+  BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14);
+  BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13);
+  BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12);
+
+  ADD_S32(d, 16, 17, c, 16);
+  SUB_S32(d, 16, 17, c, 17);
+  SUB_S32(d, 19, 18, c, 18);
+  ADD_S32(d, 19, 18, c, 19);
+  ADD_S32(d, 20, 21, c, 20);
+  SUB_S32(d, 20, 21, c, 21);
+  SUB_S32(d, 23, 22, c, 22);
+  ADD_S32(d, 23, 22, c, 23);
+  ADD_S32(d, 24, 25, c, 24);
+  SUB_S32(d, 24, 25, c, 25);
+  SUB_S32(d, 27, 26, c, 26);
+  ADD_S32(d, 27, 26, c, 27);
+  ADD_S32(d, 28, 29, c, 28);
+  SUB_S32(d, 28, 29, c, 29);
+  SUB_S32(d, 31, 30, c, 30);
+  ADD_S32(d, 31, 30, c, 31);
+
+  // Final stage.
+  // Roll rounding into this function so we can pass back int16x8.
+
+  out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]);
+  out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]);
+
+  out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]);
+  out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]);
+  out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]);
+  out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]);
+  out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]);
+
+  out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]);
+  out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]);
+  out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]);
+  out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]);
+
+  out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]);
+  out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]);
+  out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]);
+  out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]);
+  out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]);
+
+  // The remaining (odd) outputs each need one more two-coefficient butterfly
+  // before the rounding shift.
+  BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31);
+  out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]);
+  out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]);
+
+  BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15);
+  out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]);
+  out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]);
+
+  BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23);
+  out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]);
+  out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]);
+
+  BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7);
+  out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]);
+  out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]);
+
+  BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27);
+  out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]);
+  out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]);
+
+  BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11);
+  out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]);
+  out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]);
+
+  BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19);
+  out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]);
+  out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]);
+
+  BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3);
+  out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]);
+  out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]);
+}
+
+// Second pass variant used by the "_rd" (round-down) 32x32 forward DCT.
+// Same butterfly network as dct_body_second_pass, but all values are
+// rounded down after stage 2 (see the comment there) so the remaining
+// stages fit entirely in 16-bit arithmetic - no 32-bit widening is needed.
+static INLINE void dct_body_second_pass_rd(const int16x8_t *in,
+                                           int16x8_t *out) {
+  int16x8_t a[32];
+  int16x8_t b[32];
+
+  // Stage 1. Done as part of the load for the first pass.
+  a[0] = vaddq_s16(in[0], in[31]);
+  a[1] = vaddq_s16(in[1], in[30]);
+  a[2] = vaddq_s16(in[2], in[29]);
+  a[3] = vaddq_s16(in[3], in[28]);
+  a[4] = vaddq_s16(in[4], in[27]);
+  a[5] = vaddq_s16(in[5], in[26]);
+  a[6] = vaddq_s16(in[6], in[25]);
+  a[7] = vaddq_s16(in[7], in[24]);
+  a[8] = vaddq_s16(in[8], in[23]);
+  a[9] = vaddq_s16(in[9], in[22]);
+  a[10] = vaddq_s16(in[10], in[21]);
+  a[11] = vaddq_s16(in[11], in[20]);
+  a[12] = vaddq_s16(in[12], in[19]);
+  a[13] = vaddq_s16(in[13], in[18]);
+  a[14] = vaddq_s16(in[14], in[17]);
+  a[15] = vaddq_s16(in[15], in[16]);
+  a[16] = vsubq_s16(in[15], in[16]);
+  a[17] = vsubq_s16(in[14], in[17]);
+  a[18] = vsubq_s16(in[13], in[18]);
+  a[19] = vsubq_s16(in[12], in[19]);
+  a[20] = vsubq_s16(in[11], in[20]);
+  a[21] = vsubq_s16(in[10], in[21]);
+  a[22] = vsubq_s16(in[9], in[22]);
+  a[23] = vsubq_s16(in[8], in[23]);
+  a[24] = vsubq_s16(in[7], in[24]);
+  a[25] = vsubq_s16(in[6], in[25]);
+  a[26] = vsubq_s16(in[5], in[26]);
+  a[27] = vsubq_s16(in[4], in[27]);
+  a[28] = vsubq_s16(in[3], in[28]);
+  a[29] = vsubq_s16(in[2], in[29]);
+  a[30] = vsubq_s16(in[1], in[30]);
+  a[31] = vsubq_s16(in[0], in[31]);
+
+  // Stage 2.
+  // For the "rd" version, all the values are rounded down after stage 2 to keep
+  // the values in 16 bits.
+  b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
+  b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
+  b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
+  b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
+  b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
+  b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
+  b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
+  b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
+
+  b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
+  b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
+  b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
+  b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
+  b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
+  b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
+  b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
+  b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
+
+  b[16] = add_round_shift_s16(a[16]);
+  b[17] = add_round_shift_s16(a[17]);
+  b[18] = add_round_shift_s16(a[18]);
+  b[19] = add_round_shift_s16(a[19]);
+
+  // The butterfly outputs b[20]..b[27] get the same stage-2 rounding,
+  // applied after the butterflies below.
+  butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+  butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+  butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+  butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+  b[20] = add_round_shift_s16(b[20]);
+  b[21] = add_round_shift_s16(b[21]);
+  b[22] = add_round_shift_s16(b[22]);
+  b[23] = add_round_shift_s16(b[23]);
+  b[24] = add_round_shift_s16(b[24]);
+  b[25] = add_round_shift_s16(b[25]);
+  b[26] = add_round_shift_s16(b[26]);
+  b[27] = add_round_shift_s16(b[27]);
+
+  b[28] = add_round_shift_s16(a[28]);
+  b[29] = add_round_shift_s16(a[29]);
+  b[30] = add_round_shift_s16(a[30]);
+  b[31] = add_round_shift_s16(a[31]);
+
+  // Stage 3.
+  a[0] = vaddq_s16(b[0], b[7]);
+  a[1] = vaddq_s16(b[1], b[6]);
+  a[2] = vaddq_s16(b[2], b[5]);
+  a[3] = vaddq_s16(b[3], b[4]);
+
+  a[4] = vsubq_s16(b[3], b[4]);
+  a[5] = vsubq_s16(b[2], b[5]);
+  a[6] = vsubq_s16(b[1], b[6]);
+  a[7] = vsubq_s16(b[0], b[7]);
+
+  a[8] = b[8];
+  a[9] = b[9];
+
+  butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]);
+  butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]);
+
+  a[14] = b[14];
+  a[15] = b[15];
+
+  a[16] = vaddq_s16(b[16], b[23]);
+  a[17] = vaddq_s16(b[17], b[22]);
+  a[18] = vaddq_s16(b[18], b[21]);
+  a[19] = vaddq_s16(b[19], b[20]);
+
+  a[20] = vsubq_s16(b[19], b[20]);
+  a[21] = vsubq_s16(b[18], b[21]);
+  a[22] = vsubq_s16(b[17], b[22]);
+  a[23] = vsubq_s16(b[16], b[23]);
+
+  a[24] = vsubq_s16(b[31], b[24]);
+  a[25] = vsubq_s16(b[30], b[25]);
+  a[26] = vsubq_s16(b[29], b[26]);
+  a[27] = vsubq_s16(b[28], b[27]);
+
+  a[28] = vaddq_s16(b[28], b[27]);
+  a[29] = vaddq_s16(b[29], b[26]);
+  a[30] = vaddq_s16(b[30], b[25]);
+  a[31] = vaddq_s16(b[31], b[24]);
+
+  // Stage 4.
+  b[0] = vaddq_s16(a[0], a[3]);
+  b[1] = vaddq_s16(a[1], a[2]);
+  b[2] = vsubq_s16(a[1], a[2]);
+  b[3] = vsubq_s16(a[0], a[3]);
+
+  b[4] = a[4];
+
+  butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]);
+
+  b[7] = a[7];
+
+  b[8] = vaddq_s16(a[8], a[11]);
+  b[9] = vaddq_s16(a[9], a[10]);
+  b[10] = vsubq_s16(a[9], a[10]);
+  b[11] = vsubq_s16(a[8], a[11]);
+  b[12] = vsubq_s16(a[15], a[12]);
+  b[13] = vsubq_s16(a[14], a[13]);
+  b[14] = vaddq_s16(a[14], a[13]);
+  b[15] = vaddq_s16(a[15], a[12]);
+
+  b[16] = a[16];
+  b[17] = a[17];
+
+  butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]);
+  butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]);
+  butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]);
+  butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]);
+
+  b[22] = a[22];
+  b[23] = a[23];
+  b[24] = a[24];
+  b[25] = a[25];
+
+  b[30] = a[30];
+  b[31] = a[31];
+
+  // Stage 5.
+  butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]);
+  butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]);
+
+  a[4] = vaddq_s16(b[4], b[5]);
+  a[5] = vsubq_s16(b[4], b[5]);
+  a[6] = vsubq_s16(b[7], b[6]);
+  a[7] = vaddq_s16(b[7], b[6]);
+
+  a[8] = b[8];
+
+  butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]);
+  butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]);
+
+  a[11] = b[11];
+  a[12] = b[12];
+
+  a[15] = b[15];
+
+  a[16] = vaddq_s16(b[19], b[16]);
+  a[17] = vaddq_s16(b[18], b[17]);
+  a[18] = vsubq_s16(b[17], b[18]);
+  a[19] = vsubq_s16(b[16], b[19]);
+  a[20] = vsubq_s16(b[23], b[20]);
+  a[21] = vsubq_s16(b[22], b[21]);
+  a[22] = vaddq_s16(b[21], b[22]);
+  a[23] = vaddq_s16(b[20], b[23]);
+  a[24] = vaddq_s16(b[27], b[24]);
+  a[25] = vaddq_s16(b[26], b[25]);
+  a[26] = vsubq_s16(b[25], b[26]);
+  a[27] = vsubq_s16(b[24], b[27]);
+  a[28] = vsubq_s16(b[31], b[28]);
+  a[29] = vsubq_s16(b[30], b[29]);
+  a[30] = vaddq_s16(b[29], b[30]);
+  a[31] = vaddq_s16(b[28], b[31]);
+
+  // Stage 6.
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+
+  butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]);
+  butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]);
+
+  b[8] = vaddq_s16(a[8], a[9]);
+  b[9] = vsubq_s16(a[8], a[9]);
+  b[10] = vsubq_s16(a[11], a[10]);
+  b[11] = vaddq_s16(a[11], a[10]);
+  b[12] = vaddq_s16(a[12], a[13]);
+  b[13] = vsubq_s16(a[12], a[13]);
+  b[14] = vsubq_s16(a[15], a[14]);
+  b[15] = vaddq_s16(a[15], a[14]);
+
+  b[16] = a[16];
+  b[19] = a[19];
+  b[20] = a[20];
+  b[23] = a[23];
+  b[24] = a[24];
+  b[27] = a[27];
+  b[28] = a[28];
+  b[31] = a[31];
+
+  butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]);
+  butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]);
+
+  butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]);
+  butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]);
+
+  // Stage 7.
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+
+  butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]);
+  butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]);
+  butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]);
+  butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]);
+
+  a[16] = vaddq_s16(b[16], b[17]);
+  a[17] = vsubq_s16(b[16], b[17]);
+  a[18] = vsubq_s16(b[19], b[18]);
+  a[19] = vaddq_s16(b[19], b[18]);
+  a[20] = vaddq_s16(b[20], b[21]);
+  a[21] = vsubq_s16(b[20], b[21]);
+  a[22] = vsubq_s16(b[23], b[22]);
+  a[23] = vaddq_s16(b[23], b[22]);
+  a[24] = vaddq_s16(b[24], b[25]);
+  a[25] = vsubq_s16(b[24], b[25]);
+  a[26] = vsubq_s16(b[27], b[26]);
+  a[27] = vaddq_s16(b[27], b[26]);
+  a[28] = vaddq_s16(b[28], b[29]);
+  a[29] = vsubq_s16(b[28], b[29]);
+  a[30] = vsubq_s16(b[31], b[30]);
+  a[31] = vaddq_s16(b[31], b[30]);
+
+  // Final stage.
+  out[0] = a[0];
+  out[16] = a[1];
+  out[8] = a[2];
+  out[24] = a[3];
+  out[4] = a[4];
+  out[20] = a[5];
+  out[12] = a[6];
+  out[28] = a[7];
+  out[2] = a[8];
+  out[18] = a[9];
+  out[10] = a[10];
+  out[26] = a[11];
+  out[6] = a[12];
+  out[22] = a[13];
+  out[14] = a[14];
+  out[30] = a[15];
+
+  // The odd outputs each need one final two-coefficient butterfly.
+  butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]);
+  butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17],
+                      &out[15]);
+  butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]);
+  butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]);
+  butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]);
+  butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21],
+                      &out[11]);
+  butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13],
+                      &out[19]);
+  butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]);
+}
+
+// The helper macros above are only used within this header; undefine them so
+// they do not leak into translation units that include it.
+#undef PASS_THROUGH
+#undef ADD_S16_S32
+#undef SUB_S16_S32
+#undef ADDW_S16_S32
+#undef SUBW_S16_S32
+#undef ADD_S32
+#undef SUB_S32
+#undef BUTTERFLY_ONE_S16_S32
+#undef BUTTERFLY_ONE_S32
+#undef BUTTERFLY_TWO_S32
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// Store 32 32x4 vectors, assuming stride == 32.
+static INLINE void store32x32_s32(
+ tran_low_t *a, const int32x4_t *l1 /*[16]*/, const int32x4_t *r1 /*[16]*/,
+ const int32x4_t *l2 /*[16]*/, const int32x4_t *r2 /*[16]*/,
+ const int32x4_t *l3 /*[16]*/, const int32x4_t *r3 /*[16]*/,
+ const int32x4_t *l4 /*[16]*/, const int32x4_t *r4 /*[16]*/) {
+ int i;
+ for (i = 0; i < 32; i++) {
+ vst1q_s32(a, l1[i]);
+ vst1q_s32(a + 4, r1[i]);
+ vst1q_s32(a + 8, l2[i]);
+ vst1q_s32(a + 12, r2[i]);
+ vst1q_s32(a + 16, l3[i]);
+ vst1q_s32(a + 20, r3[i]);
+ vst1q_s32(a + 24, l4[i]);
+ vst1q_s32(a + 28, r4[i]);
+ a += 32;
+ }
+}
+
+// Widen each of the 32 input rows from 16 to 32 bits and apply the forward
+// transform input scaling of << 2. Lanes 0-3 of each row go to |left|,
+// lanes 4-7 to |right|.
+static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/,
+                                      int32x4_t *left /*[32]*/,
+                                      int32x4_t *right /* [32] */) {
+  int i;
+  for (i = 0; i < 32; i++) {
+    left[i] = vshll_n_s16(vget_low_s16(a[i]), 2);
+    right[i] = vshll_n_s16(vget_high_s16(a[i]), 2);
+  }
+}
+
+// Stage 1 of the transform, done as part of the load for the first pass:
+// butterfly the mirrored element pairs of both column groups.
+//   b[i]  = a[i] + a[31 - i]   for i in [0, 16)
+//   b[i]  = a[31 - i] - a[i]   for i in [16, 32)
+static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/,
+                                      int32x4_t *a_right /*[32]*/,
+                                      int32x4_t *b_left /*[32]*/,
+                                      int32x4_t *b_right /*[32]*/) {
+  int i;
+  for (i = 0; i < 16; i++) {
+    b_left[i] = vaddq_s32(a_left[i], a_left[31 - i]);
+    b_right[i] = vaddq_s32(a_right[i], a_right[31 - i]);
+  }
+  for (i = 16; i < 32; i++) {
+    b_left[i] = vsubq_s32(a_left[31 - i], a_left[i]);
+    b_right[i] = vsubq_s32(a_right[31 - i], a_right[i]);
+  }
+}
+
+static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/,
+                                                  int32x4_t *right /* [32] */) {
+  int i;
+  // Apply the partial rounding shift to every vector of both column groups:
+  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+  for (i = 0; i < 32; i++) {
+    left[i] = add_round_shift_s32(left[i]);
+    right[i] = add_round_shift_s32(right[i]);
+  }
+}
+
+// Apply sub_round_shift_s32() to every one of the 32 rows of an 8x32 tile
+// held as two int32x4_t halves per row (left = lanes 0-3, right = lanes 4-7),
+// in place. This folds the final ">> 2" rounding of a high-bitdepth 32x32
+// forward-DCT pass into the transform output instead of a separate loop.
+static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/,
+                                                  int32x4_t *right /* [32] */) {
+  // Also compute partial rounding shift:
+  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+  // NOTE(review): the formula above reads the same as the "add" variant's
+  // comment; the rounding actually applied is whatever sub_round_shift_s32()
+  // implements (presumably sign-biased the other way) -- confirm against that
+  // helper's definition.
+
+  left[0] = sub_round_shift_s32(left[0]);
+  left[1] = sub_round_shift_s32(left[1]);
+  left[2] = sub_round_shift_s32(left[2]);
+  left[3] = sub_round_shift_s32(left[3]);
+  left[4] = sub_round_shift_s32(left[4]);
+  left[5] = sub_round_shift_s32(left[5]);
+  left[6] = sub_round_shift_s32(left[6]);
+  left[7] = sub_round_shift_s32(left[7]);
+  left[8] = sub_round_shift_s32(left[8]);
+  left[9] = sub_round_shift_s32(left[9]);
+  left[10] = sub_round_shift_s32(left[10]);
+  left[11] = sub_round_shift_s32(left[11]);
+  left[12] = sub_round_shift_s32(left[12]);
+  left[13] = sub_round_shift_s32(left[13]);
+  left[14] = sub_round_shift_s32(left[14]);
+  left[15] = sub_round_shift_s32(left[15]);
+  left[16] = sub_round_shift_s32(left[16]);
+  left[17] = sub_round_shift_s32(left[17]);
+  left[18] = sub_round_shift_s32(left[18]);
+  left[19] = sub_round_shift_s32(left[19]);
+  left[20] = sub_round_shift_s32(left[20]);
+  left[21] = sub_round_shift_s32(left[21]);
+  left[22] = sub_round_shift_s32(left[22]);
+  left[23] = sub_round_shift_s32(left[23]);
+  left[24] = sub_round_shift_s32(left[24]);
+  left[25] = sub_round_shift_s32(left[25]);
+  left[26] = sub_round_shift_s32(left[26]);
+  left[27] = sub_round_shift_s32(left[27]);
+  left[28] = sub_round_shift_s32(left[28]);
+  left[29] = sub_round_shift_s32(left[29]);
+  left[30] = sub_round_shift_s32(left[30]);
+  left[31] = sub_round_shift_s32(left[31]);
+
+  right[0] = sub_round_shift_s32(right[0]);
+  right[1] = sub_round_shift_s32(right[1]);
+  right[2] = sub_round_shift_s32(right[2]);
+  right[3] = sub_round_shift_s32(right[3]);
+  right[4] = sub_round_shift_s32(right[4]);
+  right[5] = sub_round_shift_s32(right[5]);
+  right[6] = sub_round_shift_s32(right[6]);
+  right[7] = sub_round_shift_s32(right[7]);
+  right[8] = sub_round_shift_s32(right[8]);
+  right[9] = sub_round_shift_s32(right[9]);
+  right[10] = sub_round_shift_s32(right[10]);
+  right[11] = sub_round_shift_s32(right[11]);
+  right[12] = sub_round_shift_s32(right[12]);
+  right[13] = sub_round_shift_s32(right[13]);
+  right[14] = sub_round_shift_s32(right[14]);
+  right[15] = sub_round_shift_s32(right[15]);
+  right[16] = sub_round_shift_s32(right[16]);
+  right[17] = sub_round_shift_s32(right[17]);
+  right[18] = sub_round_shift_s32(right[18]);
+  right[19] = sub_round_shift_s32(right[19]);
+  right[20] = sub_round_shift_s32(right[20]);
+  right[21] = sub_round_shift_s32(right[21]);
+  right[22] = sub_round_shift_s32(right[22]);
+  right[23] = sub_round_shift_s32(right[23]);
+  right[24] = sub_round_shift_s32(right[24]);
+  right[25] = sub_round_shift_s32(right[25]);
+  right[26] = sub_round_shift_s32(right[26]);
+  right[27] = sub_round_shift_s32(right[27]);
+  right[28] = sub_round_shift_s32(right[28]);
+  right[29] = sub_round_shift_s32(right[29]);
+  right[30] = sub_round_shift_s32(right[30]);
+  right[31] = sub_round_shift_s32(right[31]);
+}
+
+// First pass of the high-bitdepth 32-point forward DCT over an 8x32 tile.
+// The tile is stored as 32 rows of two int32x4_t halves (left = lanes 0-3,
+// right = lanes 4-7) and is transformed in place. Stages alternate between
+// the a[] and b[] scratch buffers; stage 1 is folded into the load. The
+// final stage writes the coefficients back in the DCT's interleaved output
+// order (e.g. b[1] lands in row 16, b[8] in row 2).
+static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/,
+                                                  int32x4_t *right /*32*/) {
+  int32x4_t al[32], ar[32];  // scratch for even stages (left/right halves)
+  int32x4_t bl[32], br[32];  // scratch for odd stages (left/right halves)
+
+  // Stage 1: Done as part of the load.
+
+  // Stage 2.
+  // Mini cross. X the first 16 values and the middle 8 of the second half.
+  al[0] = vaddq_s32(left[0], left[15]);
+  ar[0] = vaddq_s32(right[0], right[15]);
+  al[1] = vaddq_s32(left[1], left[14]);
+  ar[1] = vaddq_s32(right[1], right[14]);
+  al[2] = vaddq_s32(left[2], left[13]);
+  ar[2] = vaddq_s32(right[2], right[13]);
+  al[3] = vaddq_s32(left[3], left[12]);
+  ar[3] = vaddq_s32(right[3], right[12]);
+  al[4] = vaddq_s32(left[4], left[11]);
+  ar[4] = vaddq_s32(right[4], right[11]);
+  al[5] = vaddq_s32(left[5], left[10]);
+  ar[5] = vaddq_s32(right[5], right[10]);
+  al[6] = vaddq_s32(left[6], left[9]);
+  ar[6] = vaddq_s32(right[6], right[9]);
+  al[7] = vaddq_s32(left[7], left[8]);
+  ar[7] = vaddq_s32(right[7], right[8]);
+
+  al[8] = vsubq_s32(left[7], left[8]);
+  ar[8] = vsubq_s32(right[7], right[8]);
+  al[9] = vsubq_s32(left[6], left[9]);
+  ar[9] = vsubq_s32(right[6], right[9]);
+  al[10] = vsubq_s32(left[5], left[10]);
+  ar[10] = vsubq_s32(right[5], right[10]);
+  al[11] = vsubq_s32(left[4], left[11]);
+  ar[11] = vsubq_s32(right[4], right[11]);
+  al[12] = vsubq_s32(left[3], left[12]);
+  ar[12] = vsubq_s32(right[3], right[12]);
+  al[13] = vsubq_s32(left[2], left[13]);
+  ar[13] = vsubq_s32(right[2], right[13]);
+  al[14] = vsubq_s32(left[1], left[14]);
+  ar[14] = vsubq_s32(right[1], right[14]);
+  al[15] = vsubq_s32(left[0], left[15]);
+  ar[15] = vsubq_s32(right[0], right[15]);
+
+  // Rows 16-19 and 28-31 pass through stage 2 unchanged.
+  al[16] = left[16];
+  ar[16] = right[16];
+  al[17] = left[17];
+  ar[17] = right[17];
+  al[18] = left[18];
+  ar[18] = right[18];
+  al[19] = left[19];
+  ar[19] = right[19];
+
+  butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+                               cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+  butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+                               cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+  butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+                               cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+  butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+                               cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+  al[28] = left[28];
+  ar[28] = right[28];
+  al[29] = left[29];
+  ar[29] = right[29];
+  al[30] = left[30];
+  ar[30] = right[30];
+  al[31] = left[31];
+  ar[31] = right[31];
+
+  // Stage 3.
+  bl[0] = vaddq_s32(al[0], al[7]);
+  br[0] = vaddq_s32(ar[0], ar[7]);
+  bl[1] = vaddq_s32(al[1], al[6]);
+  br[1] = vaddq_s32(ar[1], ar[6]);
+  bl[2] = vaddq_s32(al[2], al[5]);
+  br[2] = vaddq_s32(ar[2], ar[5]);
+  bl[3] = vaddq_s32(al[3], al[4]);
+  br[3] = vaddq_s32(ar[3], ar[4]);
+
+  bl[4] = vsubq_s32(al[3], al[4]);
+  br[4] = vsubq_s32(ar[3], ar[4]);
+  bl[5] = vsubq_s32(al[2], al[5]);
+  br[5] = vsubq_s32(ar[2], ar[5]);
+  bl[6] = vsubq_s32(al[1], al[6]);
+  br[6] = vsubq_s32(ar[1], ar[6]);
+  bl[7] = vsubq_s32(al[0], al[7]);
+  br[7] = vsubq_s32(ar[0], ar[7]);
+
+  bl[8] = al[8];
+  br[8] = ar[8];
+  bl[9] = al[9];
+  br[9] = ar[9];
+
+  butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+                               &bl[13], &br[13], &bl[10], &br[10]);
+  butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+                               &bl[12], &br[12], &bl[11], &br[11]);
+
+  bl[14] = al[14];
+  br[14] = ar[14];
+  bl[15] = al[15];
+  br[15] = ar[15];
+
+  // left[16..19]/left[28..31] still hold the stage-2 pass-through values
+  // (equal to al[16..19]/al[28..31]), so they are read directly here.
+  bl[16] = vaddq_s32(left[16], al[23]);
+  br[16] = vaddq_s32(right[16], ar[23]);
+  bl[17] = vaddq_s32(left[17], al[22]);
+  br[17] = vaddq_s32(right[17], ar[22]);
+  bl[18] = vaddq_s32(left[18], al[21]);
+  br[18] = vaddq_s32(right[18], ar[21]);
+  bl[19] = vaddq_s32(left[19], al[20]);
+  br[19] = vaddq_s32(right[19], ar[20]);
+
+  bl[20] = vsubq_s32(left[19], al[20]);
+  br[20] = vsubq_s32(right[19], ar[20]);
+  bl[21] = vsubq_s32(left[18], al[21]);
+  br[21] = vsubq_s32(right[18], ar[21]);
+  bl[22] = vsubq_s32(left[17], al[22]);
+  br[22] = vsubq_s32(right[17], ar[22]);
+  bl[23] = vsubq_s32(left[16], al[23]);
+  br[23] = vsubq_s32(right[16], ar[23]);
+
+  bl[24] = vsubq_s32(left[31], al[24]);
+  br[24] = vsubq_s32(right[31], ar[24]);
+  bl[25] = vsubq_s32(left[30], al[25]);
+  br[25] = vsubq_s32(right[30], ar[25]);
+  bl[26] = vsubq_s32(left[29], al[26]);
+  br[26] = vsubq_s32(right[29], ar[26]);
+  bl[27] = vsubq_s32(left[28], al[27]);
+  br[27] = vsubq_s32(right[28], ar[27]);
+
+  bl[28] = vaddq_s32(left[28], al[27]);
+  br[28] = vaddq_s32(right[28], ar[27]);
+  bl[29] = vaddq_s32(left[29], al[26]);
+  br[29] = vaddq_s32(right[29], ar[26]);
+  bl[30] = vaddq_s32(left[30], al[25]);
+  br[30] = vaddq_s32(right[30], ar[25]);
+  bl[31] = vaddq_s32(left[31], al[24]);
+  br[31] = vaddq_s32(right[31], ar[24]);
+
+  // Stage 4.
+  al[0] = vaddq_s32(bl[0], bl[3]);
+  ar[0] = vaddq_s32(br[0], br[3]);
+  al[1] = vaddq_s32(bl[1], bl[2]);
+  ar[1] = vaddq_s32(br[1], br[2]);
+  al[2] = vsubq_s32(bl[1], bl[2]);
+  ar[2] = vsubq_s32(br[1], br[2]);
+  al[3] = vsubq_s32(bl[0], bl[3]);
+  ar[3] = vsubq_s32(br[0], br[3]);
+
+  al[4] = bl[4];
+  ar[4] = br[4];
+
+  butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+                               &ar[6], &al[5], &ar[5]);
+
+  al[7] = bl[7];
+  ar[7] = br[7];
+
+  al[8] = vaddq_s32(bl[8], bl[11]);
+  ar[8] = vaddq_s32(br[8], br[11]);
+  al[9] = vaddq_s32(bl[9], bl[10]);
+  ar[9] = vaddq_s32(br[9], br[10]);
+  al[10] = vsubq_s32(bl[9], bl[10]);
+  ar[10] = vsubq_s32(br[9], br[10]);
+  al[11] = vsubq_s32(bl[8], bl[11]);
+  ar[11] = vsubq_s32(br[8], br[11]);
+  al[12] = vsubq_s32(bl[15], bl[12]);
+  ar[12] = vsubq_s32(br[15], br[12]);
+  al[13] = vsubq_s32(bl[14], bl[13]);
+  ar[13] = vsubq_s32(br[14], br[13]);
+  al[14] = vaddq_s32(bl[14], bl[13]);
+  ar[14] = vaddq_s32(br[14], br[13]);
+  al[15] = vaddq_s32(bl[15], bl[12]);
+  ar[15] = vaddq_s32(br[15], br[12]);
+
+  al[16] = bl[16];
+  ar[16] = br[16];
+  al[17] = bl[17];
+  ar[17] = br[17];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
+                                     cospi_24_64, &al[29], &ar[29], &al[18],
+                                     &ar[18]);
+  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
+                                     cospi_24_64, &al[28], &ar[28], &al[19],
+                                     &ar[19]);
+  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
+                                     cospi_24_64, -cospi_8_64, &al[27], &ar[27],
+                                     &al[20], &ar[20]);
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_24_64, -cospi_8_64, &al[26], &ar[26],
+                                     &al[21], &ar[21]);
+
+  al[22] = bl[22];
+  ar[22] = br[22];
+  al[23] = bl[23];
+  ar[23] = br[23];
+  al[24] = bl[24];
+  ar[24] = br[24];
+  al[25] = bl[25];
+  ar[25] = br[25];
+
+  al[30] = bl[30];
+  ar[30] = br[30];
+  al[31] = bl[31];
+  ar[31] = br[31];
+
+  // Stage 5.
+  butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+                               &br[0], &bl[1], &br[1]);
+  butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
+                                     cospi_24_64, &bl[2], &br[2], &bl[3],
+                                     &br[3]);
+
+  bl[4] = vaddq_s32(al[4], al[5]);
+  br[4] = vaddq_s32(ar[4], ar[5]);
+  bl[5] = vsubq_s32(al[4], al[5]);
+  br[5] = vsubq_s32(ar[4], ar[5]);
+  bl[6] = vsubq_s32(al[7], al[6]);
+  br[6] = vsubq_s32(ar[7], ar[6]);
+  bl[7] = vaddq_s32(al[7], al[6]);
+  br[7] = vaddq_s32(ar[7], ar[6]);
+
+  bl[8] = al[8];
+  br[8] = ar[8];
+
+  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
+                                     cospi_24_64, &bl[14], &br[14], &bl[9],
+                                     &br[9]);
+  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+                                     cospi_24_64, -cospi_8_64, &bl[13], &br[13],
+                                     &bl[10], &br[10]);
+
+  bl[11] = al[11];
+  br[11] = ar[11];
+  bl[12] = al[12];
+  br[12] = ar[12];
+
+  bl[15] = al[15];
+  br[15] = ar[15];
+
+  bl[16] = vaddq_s32(al[19], al[16]);
+  br[16] = vaddq_s32(ar[19], ar[16]);
+  bl[17] = vaddq_s32(al[18], al[17]);
+  br[17] = vaddq_s32(ar[18], ar[17]);
+  bl[18] = vsubq_s32(al[17], al[18]);
+  br[18] = vsubq_s32(ar[17], ar[18]);
+  bl[19] = vsubq_s32(al[16], al[19]);
+  br[19] = vsubq_s32(ar[16], ar[19]);
+  bl[20] = vsubq_s32(al[23], al[20]);
+  br[20] = vsubq_s32(ar[23], ar[20]);
+  bl[21] = vsubq_s32(al[22], al[21]);
+  br[21] = vsubq_s32(ar[22], ar[21]);
+  bl[22] = vaddq_s32(al[21], al[22]);
+  br[22] = vaddq_s32(ar[21], ar[22]);
+  bl[23] = vaddq_s32(al[20], al[23]);
+  br[23] = vaddq_s32(ar[20], ar[23]);
+  bl[24] = vaddq_s32(al[27], al[24]);
+  br[24] = vaddq_s32(ar[27], ar[24]);
+  bl[25] = vaddq_s32(al[26], al[25]);
+  br[25] = vaddq_s32(ar[26], ar[25]);
+  bl[26] = vsubq_s32(al[25], al[26]);
+  br[26] = vsubq_s32(ar[25], ar[26]);
+  bl[27] = vsubq_s32(al[24], al[27]);
+  br[27] = vsubq_s32(ar[24], ar[27]);
+  bl[28] = vsubq_s32(al[31], al[28]);
+  br[28] = vsubq_s32(ar[31], ar[28]);
+  bl[29] = vsubq_s32(al[30], al[29]);
+  br[29] = vsubq_s32(ar[30], ar[29]);
+  bl[30] = vaddq_s32(al[29], al[30]);
+  br[30] = vaddq_s32(ar[29], ar[30]);
+  bl[31] = vaddq_s32(al[28], al[31]);
+  br[31] = vaddq_s32(ar[28], ar[31]);
+
+  // Stage 6.
+  al[0] = bl[0];
+  ar[0] = br[0];
+  al[1] = bl[1];
+  ar[1] = br[1];
+  al[2] = bl[2];
+  ar[2] = br[2];
+  al[3] = bl[3];
+  ar[3] = br[3];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
+                                     cospi_28_64, &al[4], &ar[4], &al[7],
+                                     &ar[7]);
+  butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
+                                     cospi_12_64, &al[5], &ar[5], &al[6],
+                                     &ar[6]);
+
+  al[8] = vaddq_s32(bl[8], bl[9]);
+  ar[8] = vaddq_s32(br[8], br[9]);
+  al[9] = vsubq_s32(bl[8], bl[9]);
+  ar[9] = vsubq_s32(br[8], br[9]);
+  al[10] = vsubq_s32(bl[11], bl[10]);
+  ar[10] = vsubq_s32(br[11], br[10]);
+  al[11] = vaddq_s32(bl[11], bl[10]);
+  ar[11] = vaddq_s32(br[11], br[10]);
+  al[12] = vaddq_s32(bl[12], bl[13]);
+  ar[12] = vaddq_s32(br[12], br[13]);
+  al[13] = vsubq_s32(bl[12], bl[13]);
+  ar[13] = vsubq_s32(br[12], br[13]);
+  al[14] = vsubq_s32(bl[15], bl[14]);
+  ar[14] = vsubq_s32(br[15], br[14]);
+  al[15] = vaddq_s32(bl[15], bl[14]);
+  ar[15] = vaddq_s32(br[15], br[14]);
+
+  al[16] = bl[16];
+  ar[16] = br[16];
+  al[19] = bl[19];
+  ar[19] = br[19];
+  al[20] = bl[20];
+  ar[20] = br[20];
+  al[23] = bl[23];
+  ar[23] = br[23];
+  al[24] = bl[24];
+  ar[24] = br[24];
+  al[27] = bl[27];
+  ar[27] = br[27];
+  al[28] = bl[28];
+  ar[28] = br[28];
+  al[31] = bl[31];
+  ar[31] = br[31];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
+                                     cospi_28_64, &al[30], &ar[30], &al[17],
+                                     &ar[17]);
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
+                                     cospi_28_64, -cospi_4_64, &al[29], &ar[29],
+                                     &al[18], &ar[18]);
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_20_64, cospi_12_64, &al[26], &ar[26],
+                                     &al[21], &ar[21]);
+  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+                                     cospi_12_64, -cospi_20_64, &al[25],
+                                     &ar[25], &al[22], &ar[22]);
+
+  // Stage 7.
+  bl[0] = al[0];
+  br[0] = ar[0];
+  bl[1] = al[1];
+  br[1] = ar[1];
+  bl[2] = al[2];
+  br[2] = ar[2];
+  bl[3] = al[3];
+  br[3] = ar[3];
+  bl[4] = al[4];
+  br[4] = ar[4];
+  bl[5] = al[5];
+  br[5] = ar[5];
+  bl[6] = al[6];
+  br[6] = ar[6];
+  bl[7] = al[7];
+  br[7] = ar[7];
+
+  butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
+                                     cospi_30_64, &bl[8], &br[8], &bl[15],
+                                     &br[15]);
+  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
+                                     cospi_14_64, &bl[9], &br[9], &bl[14],
+                                     &br[14]);
+  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+                                     cospi_10_64, cospi_22_64, &bl[10], &br[10],
+                                     &bl[13], &br[13]);
+  butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
+                                     cospi_26_64, cospi_6_64, &bl[11], &br[11],
+                                     &bl[12], &br[12]);
+
+  bl[16] = vaddq_s32(al[16], al[17]);
+  br[16] = vaddq_s32(ar[16], ar[17]);
+  bl[17] = vsubq_s32(al[16], al[17]);
+  br[17] = vsubq_s32(ar[16], ar[17]);
+  bl[18] = vsubq_s32(al[19], al[18]);
+  br[18] = vsubq_s32(ar[19], ar[18]);
+  bl[19] = vaddq_s32(al[19], al[18]);
+  br[19] = vaddq_s32(ar[19], ar[18]);
+  bl[20] = vaddq_s32(al[20], al[21]);
+  br[20] = vaddq_s32(ar[20], ar[21]);
+  bl[21] = vsubq_s32(al[20], al[21]);
+  br[21] = vsubq_s32(ar[20], ar[21]);
+  bl[22] = vsubq_s32(al[23], al[22]);
+  br[22] = vsubq_s32(ar[23], ar[22]);
+  bl[23] = vaddq_s32(al[23], al[22]);
+  br[23] = vaddq_s32(ar[23], ar[22]);
+  bl[24] = vaddq_s32(al[24], al[25]);
+  br[24] = vaddq_s32(ar[24], ar[25]);
+  bl[25] = vsubq_s32(al[24], al[25]);
+  br[25] = vsubq_s32(ar[24], ar[25]);
+  bl[26] = vsubq_s32(al[27], al[26]);
+  br[26] = vsubq_s32(ar[27], ar[26]);
+  bl[27] = vaddq_s32(al[27], al[26]);
+  br[27] = vaddq_s32(ar[27], ar[26]);
+  bl[28] = vaddq_s32(al[28], al[29]);
+  br[28] = vaddq_s32(ar[28], ar[29]);
+  bl[29] = vsubq_s32(al[28], al[29]);
+  br[29] = vsubq_s32(ar[28], ar[29]);
+  bl[30] = vsubq_s32(al[31], al[30]);
+  br[30] = vsubq_s32(ar[31], ar[30]);
+  bl[31] = vaddq_s32(al[31], al[30]);
+  br[31] = vaddq_s32(ar[31], ar[30]);
+
+  // Final stage.
+  // Even coefficients (b[0..15]) are finished; scatter them to the even
+  // output rows. The odd rows are produced by the butterflies below.
+
+  left[0] = bl[0];
+  right[0] = br[0];
+  left[16] = bl[1];
+  right[16] = br[1];
+  left[8] = bl[2];
+  right[8] = br[2];
+  left[24] = bl[3];
+  right[24] = br[3];
+  left[4] = bl[4];
+  right[4] = br[4];
+  left[20] = bl[5];
+  right[20] = br[5];
+  left[12] = bl[6];
+  right[12] = br[6];
+  left[28] = bl[7];
+  right[28] = br[7];
+  left[2] = bl[8];
+  right[2] = br[8];
+  left[18] = bl[9];
+  right[18] = br[9];
+  left[10] = bl[10];
+  right[10] = br[10];
+  left[26] = bl[11];
+  right[26] = br[11];
+  left[6] = bl[12];
+  right[6] = br[12];
+  left[22] = bl[13];
+  right[22] = br[13];
+  left[14] = bl[14];
+  right[14] = br[14];
+  left[30] = bl[15];
+  right[30] = br[15];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
+                                     cospi_31_64, &al[1], &ar[1], &al[31],
+                                     &ar[31]);
+  left[1] = al[1];
+  right[1] = ar[1];
+  left[31] = al[31];
+  right[31] = ar[31];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
+                                     cospi_17_64, cospi_15_64, &al[17], &ar[17],
+                                     &al[15], &ar[15]);
+  left[17] = al[17];
+  right[17] = ar[17];
+  left[15] = al[15];
+  right[15] = ar[15];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
+                                     cospi_23_64, &al[9], &ar[9], &al[23],
+                                     &ar[23]);
+  left[9] = al[9];
+  right[9] = ar[9];
+  left[23] = al[23];
+  right[23] = ar[23];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
+                                     cospi_25_64, cospi_7_64, &al[25], &ar[25],
+                                     &al[7], &ar[7]);
+  left[25] = al[25];
+  right[25] = ar[25];
+  left[7] = al[7];
+  right[7] = ar[7];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
+                                     cospi_27_64, &al[5], &ar[5], &al[27],
+                                     &ar[27]);
+  left[5] = al[5];
+  right[5] = ar[5];
+  left[27] = al[27];
+  right[27] = ar[27];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_21_64, cospi_11_64, &al[21], &ar[21],
+                                     &al[11], &ar[11]);
+  left[21] = al[21];
+  right[21] = ar[21];
+  left[11] = al[11];
+  right[11] = ar[11];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+                                     cospi_13_64, cospi_19_64, &al[13], &ar[13],
+                                     &al[19], &ar[19]);
+  left[13] = al[13];
+  right[13] = ar[13];
+  left[19] = al[19];
+  right[19] = ar[19];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
+                                     cospi_29_64, cospi_3_64, &al[29], &ar[29],
+                                     &al[3], &ar[3]);
+  left[29] = al[29];
+  right[29] = ar[29];
+  left[3] = al[3];
+  right[3] = ar[3];
+}
+
+// Second pass of the high-bitdepth 32-point forward DCT over an 8x32 tile,
+// stored and transformed in place exactly like the first pass (left = lanes
+// 0-3, right = lanes 4-7 of each of the 32 rows). NOTE(review): this body is
+// token-identical to highbd_dct8x32_body_first_pass above; the passes are
+// presumably kept separate to mirror the non-highbd variants (where they
+// differ in rounding) -- confirm before attempting to merge them.
+static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/,
+                                                   int32x4_t *right /*32*/) {
+  int32x4_t al[32], ar[32];  // scratch for even stages (left/right halves)
+  int32x4_t bl[32], br[32];  // scratch for odd stages (left/right halves)
+
+  // Stage 1: Done as part of the load.
+
+  // Stage 2.
+  // Mini cross. X the first 16 values and the middle 8 of the second half.
+  al[0] = vaddq_s32(left[0], left[15]);
+  ar[0] = vaddq_s32(right[0], right[15]);
+  al[1] = vaddq_s32(left[1], left[14]);
+  ar[1] = vaddq_s32(right[1], right[14]);
+  al[2] = vaddq_s32(left[2], left[13]);
+  ar[2] = vaddq_s32(right[2], right[13]);
+  al[3] = vaddq_s32(left[3], left[12]);
+  ar[3] = vaddq_s32(right[3], right[12]);
+  al[4] = vaddq_s32(left[4], left[11]);
+  ar[4] = vaddq_s32(right[4], right[11]);
+  al[5] = vaddq_s32(left[5], left[10]);
+  ar[5] = vaddq_s32(right[5], right[10]);
+  al[6] = vaddq_s32(left[6], left[9]);
+  ar[6] = vaddq_s32(right[6], right[9]);
+  al[7] = vaddq_s32(left[7], left[8]);
+  ar[7] = vaddq_s32(right[7], right[8]);
+
+  al[8] = vsubq_s32(left[7], left[8]);
+  ar[8] = vsubq_s32(right[7], right[8]);
+  al[9] = vsubq_s32(left[6], left[9]);
+  ar[9] = vsubq_s32(right[6], right[9]);
+  al[10] = vsubq_s32(left[5], left[10]);
+  ar[10] = vsubq_s32(right[5], right[10]);
+  al[11] = vsubq_s32(left[4], left[11]);
+  ar[11] = vsubq_s32(right[4], right[11]);
+  al[12] = vsubq_s32(left[3], left[12]);
+  ar[12] = vsubq_s32(right[3], right[12]);
+  al[13] = vsubq_s32(left[2], left[13]);
+  ar[13] = vsubq_s32(right[2], right[13]);
+  al[14] = vsubq_s32(left[1], left[14]);
+  ar[14] = vsubq_s32(right[1], right[14]);
+  al[15] = vsubq_s32(left[0], left[15]);
+  ar[15] = vsubq_s32(right[0], right[15]);
+
+  // Rows 16-19 and 28-31 pass through stage 2 unchanged.
+  al[16] = left[16];
+  ar[16] = right[16];
+  al[17] = left[17];
+  ar[17] = right[17];
+  al[18] = left[18];
+  ar[18] = right[18];
+  al[19] = left[19];
+  ar[19] = right[19];
+
+  butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+                               cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+  butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+                               cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+  butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+                               cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+  butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+                               cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+  al[28] = left[28];
+  ar[28] = right[28];
+  al[29] = left[29];
+  ar[29] = right[29];
+  al[30] = left[30];
+  ar[30] = right[30];
+  al[31] = left[31];
+  ar[31] = right[31];
+
+  // Stage 3.
+  bl[0] = vaddq_s32(al[0], al[7]);
+  br[0] = vaddq_s32(ar[0], ar[7]);
+  bl[1] = vaddq_s32(al[1], al[6]);
+  br[1] = vaddq_s32(ar[1], ar[6]);
+  bl[2] = vaddq_s32(al[2], al[5]);
+  br[2] = vaddq_s32(ar[2], ar[5]);
+  bl[3] = vaddq_s32(al[3], al[4]);
+  br[3] = vaddq_s32(ar[3], ar[4]);
+
+  bl[4] = vsubq_s32(al[3], al[4]);
+  br[4] = vsubq_s32(ar[3], ar[4]);
+  bl[5] = vsubq_s32(al[2], al[5]);
+  br[5] = vsubq_s32(ar[2], ar[5]);
+  bl[6] = vsubq_s32(al[1], al[6]);
+  br[6] = vsubq_s32(ar[1], ar[6]);
+  bl[7] = vsubq_s32(al[0], al[7]);
+  br[7] = vsubq_s32(ar[0], ar[7]);
+
+  bl[8] = al[8];
+  br[8] = ar[8];
+  bl[9] = al[9];
+  br[9] = ar[9];
+
+  butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+                               &bl[13], &br[13], &bl[10], &br[10]);
+  butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+                               &bl[12], &br[12], &bl[11], &br[11]);
+
+  bl[14] = al[14];
+  br[14] = ar[14];
+  bl[15] = al[15];
+  br[15] = ar[15];
+
+  // left[16..19]/left[28..31] still hold the stage-2 pass-through values
+  // (equal to al[16..19]/al[28..31]), so they are read directly here.
+  bl[16] = vaddq_s32(left[16], al[23]);
+  br[16] = vaddq_s32(right[16], ar[23]);
+  bl[17] = vaddq_s32(left[17], al[22]);
+  br[17] = vaddq_s32(right[17], ar[22]);
+  bl[18] = vaddq_s32(left[18], al[21]);
+  br[18] = vaddq_s32(right[18], ar[21]);
+  bl[19] = vaddq_s32(left[19], al[20]);
+  br[19] = vaddq_s32(right[19], ar[20]);
+
+  bl[20] = vsubq_s32(left[19], al[20]);
+  br[20] = vsubq_s32(right[19], ar[20]);
+  bl[21] = vsubq_s32(left[18], al[21]);
+  br[21] = vsubq_s32(right[18], ar[21]);
+  bl[22] = vsubq_s32(left[17], al[22]);
+  br[22] = vsubq_s32(right[17], ar[22]);
+  bl[23] = vsubq_s32(left[16], al[23]);
+  br[23] = vsubq_s32(right[16], ar[23]);
+
+  bl[24] = vsubq_s32(left[31], al[24]);
+  br[24] = vsubq_s32(right[31], ar[24]);
+  bl[25] = vsubq_s32(left[30], al[25]);
+  br[25] = vsubq_s32(right[30], ar[25]);
+  bl[26] = vsubq_s32(left[29], al[26]);
+  br[26] = vsubq_s32(right[29], ar[26]);
+  bl[27] = vsubq_s32(left[28], al[27]);
+  br[27] = vsubq_s32(right[28], ar[27]);
+
+  bl[28] = vaddq_s32(left[28], al[27]);
+  br[28] = vaddq_s32(right[28], ar[27]);
+  bl[29] = vaddq_s32(left[29], al[26]);
+  br[29] = vaddq_s32(right[29], ar[26]);
+  bl[30] = vaddq_s32(left[30], al[25]);
+  br[30] = vaddq_s32(right[30], ar[25]);
+  bl[31] = vaddq_s32(left[31], al[24]);
+  br[31] = vaddq_s32(right[31], ar[24]);
+
+  // Stage 4.
+  al[0] = vaddq_s32(bl[0], bl[3]);
+  ar[0] = vaddq_s32(br[0], br[3]);
+  al[1] = vaddq_s32(bl[1], bl[2]);
+  ar[1] = vaddq_s32(br[1], br[2]);
+  al[2] = vsubq_s32(bl[1], bl[2]);
+  ar[2] = vsubq_s32(br[1], br[2]);
+  al[3] = vsubq_s32(bl[0], bl[3]);
+  ar[3] = vsubq_s32(br[0], br[3]);
+
+  al[4] = bl[4];
+  ar[4] = br[4];
+
+  butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+                               &ar[6], &al[5], &ar[5]);
+
+  al[7] = bl[7];
+  ar[7] = br[7];
+
+  al[8] = vaddq_s32(bl[8], bl[11]);
+  ar[8] = vaddq_s32(br[8], br[11]);
+  al[9] = vaddq_s32(bl[9], bl[10]);
+  ar[9] = vaddq_s32(br[9], br[10]);
+  al[10] = vsubq_s32(bl[9], bl[10]);
+  ar[10] = vsubq_s32(br[9], br[10]);
+  al[11] = vsubq_s32(bl[8], bl[11]);
+  ar[11] = vsubq_s32(br[8], br[11]);
+  al[12] = vsubq_s32(bl[15], bl[12]);
+  ar[12] = vsubq_s32(br[15], br[12]);
+  al[13] = vsubq_s32(bl[14], bl[13]);
+  ar[13] = vsubq_s32(br[14], br[13]);
+  al[14] = vaddq_s32(bl[14], bl[13]);
+  ar[14] = vaddq_s32(br[14], br[13]);
+  al[15] = vaddq_s32(bl[15], bl[12]);
+  ar[15] = vaddq_s32(br[15], br[12]);
+
+  al[16] = bl[16];
+  ar[16] = br[16];
+  al[17] = bl[17];
+  ar[17] = br[17];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
+                                     cospi_24_64, &al[29], &ar[29], &al[18],
+                                     &ar[18]);
+  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
+                                     cospi_24_64, &al[28], &ar[28], &al[19],
+                                     &ar[19]);
+  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
+                                     cospi_24_64, -cospi_8_64, &al[27], &ar[27],
+                                     &al[20], &ar[20]);
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_24_64, -cospi_8_64, &al[26], &ar[26],
+                                     &al[21], &ar[21]);
+
+  al[22] = bl[22];
+  ar[22] = br[22];
+  al[23] = bl[23];
+  ar[23] = br[23];
+  al[24] = bl[24];
+  ar[24] = br[24];
+  al[25] = bl[25];
+  ar[25] = br[25];
+
+  al[30] = bl[30];
+  ar[30] = br[30];
+  al[31] = bl[31];
+  ar[31] = br[31];
+
+  // Stage 5.
+  butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+                               &br[0], &bl[1], &br[1]);
+  butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
+                                     cospi_24_64, &bl[2], &br[2], &bl[3],
+                                     &br[3]);
+
+  bl[4] = vaddq_s32(al[4], al[5]);
+  br[4] = vaddq_s32(ar[4], ar[5]);
+  bl[5] = vsubq_s32(al[4], al[5]);
+  br[5] = vsubq_s32(ar[4], ar[5]);
+  bl[6] = vsubq_s32(al[7], al[6]);
+  br[6] = vsubq_s32(ar[7], ar[6]);
+  bl[7] = vaddq_s32(al[7], al[6]);
+  br[7] = vaddq_s32(ar[7], ar[6]);
+
+  bl[8] = al[8];
+  br[8] = ar[8];
+
+  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
+                                     cospi_24_64, &bl[14], &br[14], &bl[9],
+                                     &br[9]);
+  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+                                     cospi_24_64, -cospi_8_64, &bl[13], &br[13],
+                                     &bl[10], &br[10]);
+
+  bl[11] = al[11];
+  br[11] = ar[11];
+  bl[12] = al[12];
+  br[12] = ar[12];
+
+  bl[15] = al[15];
+  br[15] = ar[15];
+
+  bl[16] = vaddq_s32(al[19], al[16]);
+  br[16] = vaddq_s32(ar[19], ar[16]);
+  bl[17] = vaddq_s32(al[18], al[17]);
+  br[17] = vaddq_s32(ar[18], ar[17]);
+  bl[18] = vsubq_s32(al[17], al[18]);
+  br[18] = vsubq_s32(ar[17], ar[18]);
+  bl[19] = vsubq_s32(al[16], al[19]);
+  br[19] = vsubq_s32(ar[16], ar[19]);
+  bl[20] = vsubq_s32(al[23], al[20]);
+  br[20] = vsubq_s32(ar[23], ar[20]);
+  bl[21] = vsubq_s32(al[22], al[21]);
+  br[21] = vsubq_s32(ar[22], ar[21]);
+  bl[22] = vaddq_s32(al[21], al[22]);
+  br[22] = vaddq_s32(ar[21], ar[22]);
+  bl[23] = vaddq_s32(al[20], al[23]);
+  br[23] = vaddq_s32(ar[20], ar[23]);
+  bl[24] = vaddq_s32(al[27], al[24]);
+  br[24] = vaddq_s32(ar[27], ar[24]);
+  bl[25] = vaddq_s32(al[26], al[25]);
+  br[25] = vaddq_s32(ar[26], ar[25]);
+  bl[26] = vsubq_s32(al[25], al[26]);
+  br[26] = vsubq_s32(ar[25], ar[26]);
+  bl[27] = vsubq_s32(al[24], al[27]);
+  br[27] = vsubq_s32(ar[24], ar[27]);
+  bl[28] = vsubq_s32(al[31], al[28]);
+  br[28] = vsubq_s32(ar[31], ar[28]);
+  bl[29] = vsubq_s32(al[30], al[29]);
+  br[29] = vsubq_s32(ar[30], ar[29]);
+  bl[30] = vaddq_s32(al[29], al[30]);
+  br[30] = vaddq_s32(ar[29], ar[30]);
+  bl[31] = vaddq_s32(al[28], al[31]);
+  br[31] = vaddq_s32(ar[28], ar[31]);
+
+  // Stage 6.
+  al[0] = bl[0];
+  ar[0] = br[0];
+  al[1] = bl[1];
+  ar[1] = br[1];
+  al[2] = bl[2];
+  ar[2] = br[2];
+  al[3] = bl[3];
+  ar[3] = br[3];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
+                                     cospi_28_64, &al[4], &ar[4], &al[7],
+                                     &ar[7]);
+  butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
+                                     cospi_12_64, &al[5], &ar[5], &al[6],
+                                     &ar[6]);
+
+  al[8] = vaddq_s32(bl[8], bl[9]);
+  ar[8] = vaddq_s32(br[8], br[9]);
+  al[9] = vsubq_s32(bl[8], bl[9]);
+  ar[9] = vsubq_s32(br[8], br[9]);
+  al[10] = vsubq_s32(bl[11], bl[10]);
+  ar[10] = vsubq_s32(br[11], br[10]);
+  al[11] = vaddq_s32(bl[11], bl[10]);
+  ar[11] = vaddq_s32(br[11], br[10]);
+  al[12] = vaddq_s32(bl[12], bl[13]);
+  ar[12] = vaddq_s32(br[12], br[13]);
+  al[13] = vsubq_s32(bl[12], bl[13]);
+  ar[13] = vsubq_s32(br[12], br[13]);
+  al[14] = vsubq_s32(bl[15], bl[14]);
+  ar[14] = vsubq_s32(br[15], br[14]);
+  al[15] = vaddq_s32(bl[15], bl[14]);
+  ar[15] = vaddq_s32(br[15], br[14]);
+
+  al[16] = bl[16];
+  ar[16] = br[16];
+  al[19] = bl[19];
+  ar[19] = br[19];
+  al[20] = bl[20];
+  ar[20] = br[20];
+  al[23] = bl[23];
+  ar[23] = br[23];
+  al[24] = bl[24];
+  ar[24] = br[24];
+  al[27] = bl[27];
+  ar[27] = br[27];
+  al[28] = bl[28];
+  ar[28] = br[28];
+  al[31] = bl[31];
+  ar[31] = br[31];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
+                                     cospi_28_64, &al[30], &ar[30], &al[17],
+                                     &ar[17]);
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
+                                     cospi_28_64, -cospi_4_64, &al[29], &ar[29],
+                                     &al[18], &ar[18]);
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_20_64, cospi_12_64, &al[26], &ar[26],
+                                     &al[21], &ar[21]);
+  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+                                     cospi_12_64, -cospi_20_64, &al[25],
+                                     &ar[25], &al[22], &ar[22]);
+
+  // Stage 7.
+  bl[0] = al[0];
+  br[0] = ar[0];
+  bl[1] = al[1];
+  br[1] = ar[1];
+  bl[2] = al[2];
+  br[2] = ar[2];
+  bl[3] = al[3];
+  br[3] = ar[3];
+  bl[4] = al[4];
+  br[4] = ar[4];
+  bl[5] = al[5];
+  br[5] = ar[5];
+  bl[6] = al[6];
+  br[6] = ar[6];
+  bl[7] = al[7];
+  br[7] = ar[7];
+
+  butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
+                                     cospi_30_64, &bl[8], &br[8], &bl[15],
+                                     &br[15]);
+  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
+                                     cospi_14_64, &bl[9], &br[9], &bl[14],
+                                     &br[14]);
+  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+                                     cospi_10_64, cospi_22_64, &bl[10], &br[10],
+                                     &bl[13], &br[13]);
+  butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
+                                     cospi_26_64, cospi_6_64, &bl[11], &br[11],
+                                     &bl[12], &br[12]);
+
+  bl[16] = vaddq_s32(al[16], al[17]);
+  br[16] = vaddq_s32(ar[16], ar[17]);
+  bl[17] = vsubq_s32(al[16], al[17]);
+  br[17] = vsubq_s32(ar[16], ar[17]);
+  bl[18] = vsubq_s32(al[19], al[18]);
+  br[18] = vsubq_s32(ar[19], ar[18]);
+  bl[19] = vaddq_s32(al[19], al[18]);
+  br[19] = vaddq_s32(ar[19], ar[18]);
+  bl[20] = vaddq_s32(al[20], al[21]);
+  br[20] = vaddq_s32(ar[20], ar[21]);
+  bl[21] = vsubq_s32(al[20], al[21]);
+  br[21] = vsubq_s32(ar[20], ar[21]);
+  bl[22] = vsubq_s32(al[23], al[22]);
+  br[22] = vsubq_s32(ar[23], ar[22]);
+  bl[23] = vaddq_s32(al[23], al[22]);
+  br[23] = vaddq_s32(ar[23], ar[22]);
+  bl[24] = vaddq_s32(al[24], al[25]);
+  br[24] = vaddq_s32(ar[24], ar[25]);
+  bl[25] = vsubq_s32(al[24], al[25]);
+  br[25] = vsubq_s32(ar[24], ar[25]);
+  bl[26] = vsubq_s32(al[27], al[26]);
+  br[26] = vsubq_s32(ar[27], ar[26]);
+  bl[27] = vaddq_s32(al[27], al[26]);
+  br[27] = vaddq_s32(ar[27], ar[26]);
+  bl[28] = vaddq_s32(al[28], al[29]);
+  br[28] = vaddq_s32(ar[28], ar[29]);
+  bl[29] = vsubq_s32(al[28], al[29]);
+  br[29] = vsubq_s32(ar[28], ar[29]);
+  bl[30] = vsubq_s32(al[31], al[30]);
+  br[30] = vsubq_s32(ar[31], ar[30]);
+  bl[31] = vaddq_s32(al[31], al[30]);
+  br[31] = vaddq_s32(ar[31], ar[30]);
+
+  // Final stage.
+  // Even coefficients (b[0..15]) are finished; scatter them to the even
+  // output rows. The odd rows are produced by the butterflies below.
+
+  left[0] = bl[0];
+  right[0] = br[0];
+  left[16] = bl[1];
+  right[16] = br[1];
+  left[8] = bl[2];
+  right[8] = br[2];
+  left[24] = bl[3];
+  right[24] = br[3];
+  left[4] = bl[4];
+  right[4] = br[4];
+  left[20] = bl[5];
+  right[20] = br[5];
+  left[12] = bl[6];
+  right[12] = br[6];
+  left[28] = bl[7];
+  right[28] = br[7];
+  left[2] = bl[8];
+  right[2] = br[8];
+  left[18] = bl[9];
+  right[18] = br[9];
+  left[10] = bl[10];
+  right[10] = br[10];
+  left[26] = bl[11];
+  right[26] = br[11];
+  left[6] = bl[12];
+  right[6] = br[12];
+  left[22] = bl[13];
+  right[22] = br[13];
+  left[14] = bl[14];
+  right[14] = br[14];
+  left[30] = bl[15];
+  right[30] = br[15];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
+                                     cospi_31_64, &al[1], &ar[1], &al[31],
+                                     &ar[31]);
+  left[1] = al[1];
+  right[1] = ar[1];
+  left[31] = al[31];
+  right[31] = ar[31];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
+                                     cospi_17_64, cospi_15_64, &al[17], &ar[17],
+                                     &al[15], &ar[15]);
+  left[17] = al[17];
+  right[17] = ar[17];
+  left[15] = al[15];
+  right[15] = ar[15];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
+                                     cospi_23_64, &al[9], &ar[9], &al[23],
+                                     &ar[23]);
+  left[9] = al[9];
+  right[9] = ar[9];
+  left[23] = al[23];
+  right[23] = ar[23];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
+                                     cospi_25_64, cospi_7_64, &al[25], &ar[25],
+                                     &al[7], &ar[7]);
+  left[25] = al[25];
+  right[25] = ar[25];
+  left[7] = al[7];
+  right[7] = ar[7];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
+                                     cospi_27_64, &al[5], &ar[5], &al[27],
+                                     &ar[27]);
+  left[5] = al[5];
+  right[5] = ar[5];
+  left[27] = al[27];
+  right[27] = ar[27];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_21_64, cospi_11_64, &al[21], &ar[21],
+                                     &al[11], &ar[11]);
+  left[21] = al[21];
+  right[21] = ar[21];
+  left[11] = al[11];
+  right[11] = ar[11];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+                                     cospi_13_64, cospi_19_64, &al[13], &ar[13],
+                                     &al[19], &ar[19]);
+  left[13] = al[13];
+  right[13] = ar[13];
+  left[19] = al[19];
+  right[19] = ar[19];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
+                                     cospi_29_64, cospi_3_64, &al[29], &ar[29],
+                                     &al[3], &ar[3]);
+  left[29] = al[29];
+  right[29] = ar[29];
+  left[3] = al[3];
+  right[3] = ar[3];
+}
+
+// Column DCT body for the second pass of the high-bitdepth 32x32 forward
+// transform, "rd" (round-down) variant.  Operates on 32 rows held as two
+// int32x4 halves per row (left = lanes 0-3, right = lanes 4-7) and writes
+// the DCT outputs back in place.  Differs from the plain body only in the
+// add_round_shift_s32() applied after stage 2 (see comment below).
+static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/,
+ int32x4_t *right /*32*/) {
+ // a*/b* ping-pong as the working registers between stages.
+ int32x4_t al[32], ar[32];
+ int32x4_t bl[32], br[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // For the "rd" version, all the values are rounded down after stage 2 to keep
+ // the values in 16 bits.
+ al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15]));
+ ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15]));
+ al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14]));
+ ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14]));
+ al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13]));
+ ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13]));
+ al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12]));
+ ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12]));
+ al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11]));
+ ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11]));
+ al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10]));
+ ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10]));
+ al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9]));
+ ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9]));
+ al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8]));
+ ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8]));
+
+ al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8]));
+ ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8]));
+ al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9]));
+ ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9]));
+ al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10]));
+ ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10]));
+ al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11]));
+ ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11]));
+ al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12]));
+ ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12]));
+ al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13]));
+ ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13]));
+ al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14]));
+ ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14]));
+ al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15]));
+ ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15]));
+
+ // Rows 16-19 and 28-31 pass through stage 2 unchanged (only rounded).
+ al[16] = add_round_shift_s32(left[16]);
+ ar[16] = add_round_shift_s32(right[16]);
+ al[17] = add_round_shift_s32(left[17]);
+ ar[17] = add_round_shift_s32(right[17]);
+ al[18] = add_round_shift_s32(left[18]);
+ ar[18] = add_round_shift_s32(right[18]);
+ al[19] = add_round_shift_s32(left[19]);
+ ar[19] = add_round_shift_s32(right[19]);
+
+ // Rows 20-27 are rotated by cospi_16_64 first, then rounded below.
+ butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+ cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+ cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+ cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+ butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+ cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+ al[20] = add_round_shift_s32(al[20]);
+ ar[20] = add_round_shift_s32(ar[20]);
+ al[21] = add_round_shift_s32(al[21]);
+ ar[21] = add_round_shift_s32(ar[21]);
+ al[22] = add_round_shift_s32(al[22]);
+ ar[22] = add_round_shift_s32(ar[22]);
+ al[23] = add_round_shift_s32(al[23]);
+ ar[23] = add_round_shift_s32(ar[23]);
+ al[24] = add_round_shift_s32(al[24]);
+ ar[24] = add_round_shift_s32(ar[24]);
+ al[25] = add_round_shift_s32(al[25]);
+ ar[25] = add_round_shift_s32(ar[25]);
+ al[26] = add_round_shift_s32(al[26]);
+ ar[26] = add_round_shift_s32(ar[26]);
+ al[27] = add_round_shift_s32(al[27]);
+ ar[27] = add_round_shift_s32(ar[27]);
+
+ al[28] = add_round_shift_s32(left[28]);
+ ar[28] = add_round_shift_s32(right[28]);
+ al[29] = add_round_shift_s32(left[29]);
+ ar[29] = add_round_shift_s32(right[29]);
+ al[30] = add_round_shift_s32(left[30]);
+ ar[30] = add_round_shift_s32(right[30]);
+ al[31] = add_round_shift_s32(left[31]);
+ ar[31] = add_round_shift_s32(right[31]);
+
+ // Stage 3.
+ bl[0] = vaddq_s32(al[0], al[7]);
+ br[0] = vaddq_s32(ar[0], ar[7]);
+ bl[1] = vaddq_s32(al[1], al[6]);
+ br[1] = vaddq_s32(ar[1], ar[6]);
+ bl[2] = vaddq_s32(al[2], al[5]);
+ br[2] = vaddq_s32(ar[2], ar[5]);
+ bl[3] = vaddq_s32(al[3], al[4]);
+ br[3] = vaddq_s32(ar[3], ar[4]);
+
+ bl[4] = vsubq_s32(al[3], al[4]);
+ br[4] = vsubq_s32(ar[3], ar[4]);
+ bl[5] = vsubq_s32(al[2], al[5]);
+ br[5] = vsubq_s32(ar[2], ar[5]);
+ bl[6] = vsubq_s32(al[1], al[6]);
+ br[6] = vsubq_s32(ar[1], ar[6]);
+ bl[7] = vsubq_s32(al[0], al[7]);
+ br[7] = vsubq_s32(ar[0], ar[7]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+ bl[9] = al[9];
+ br[9] = ar[9];
+
+ butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+ &bl[13], &br[13], &bl[10], &br[10]);
+ butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+ &bl[12], &br[12], &bl[11], &br[11]);
+
+ bl[14] = al[14];
+ br[14] = ar[14];
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[16], al[23]);
+ br[16] = vaddq_s32(ar[16], ar[23]);
+ bl[17] = vaddq_s32(al[17], al[22]);
+ br[17] = vaddq_s32(ar[17], ar[22]);
+ bl[18] = vaddq_s32(al[18], al[21]);
+ br[18] = vaddq_s32(ar[18], ar[21]);
+ bl[19] = vaddq_s32(al[19], al[20]);
+ br[19] = vaddq_s32(ar[19], ar[20]);
+
+ bl[20] = vsubq_s32(al[19], al[20]);
+ br[20] = vsubq_s32(ar[19], ar[20]);
+ bl[21] = vsubq_s32(al[18], al[21]);
+ br[21] = vsubq_s32(ar[18], ar[21]);
+ bl[22] = vsubq_s32(al[17], al[22]);
+ br[22] = vsubq_s32(ar[17], ar[22]);
+ bl[23] = vsubq_s32(al[16], al[23]);
+ br[23] = vsubq_s32(ar[16], ar[23]);
+
+ bl[24] = vsubq_s32(al[31], al[24]);
+ br[24] = vsubq_s32(ar[31], ar[24]);
+ bl[25] = vsubq_s32(al[30], al[25]);
+ br[25] = vsubq_s32(ar[30], ar[25]);
+ bl[26] = vsubq_s32(al[29], al[26]);
+ br[26] = vsubq_s32(ar[29], ar[26]);
+ bl[27] = vsubq_s32(al[28], al[27]);
+ br[27] = vsubq_s32(ar[28], ar[27]);
+
+ bl[28] = vaddq_s32(al[28], al[27]);
+ br[28] = vaddq_s32(ar[28], ar[27]);
+ bl[29] = vaddq_s32(al[29], al[26]);
+ br[29] = vaddq_s32(ar[29], ar[26]);
+ bl[30] = vaddq_s32(al[30], al[25]);
+ br[30] = vaddq_s32(ar[30], ar[25]);
+ bl[31] = vaddq_s32(al[31], al[24]);
+ br[31] = vaddq_s32(ar[31], ar[24]);
+
+ // Stage 4.
+ al[0] = vaddq_s32(bl[0], bl[3]);
+ ar[0] = vaddq_s32(br[0], br[3]);
+ al[1] = vaddq_s32(bl[1], bl[2]);
+ ar[1] = vaddq_s32(br[1], br[2]);
+ al[2] = vsubq_s32(bl[1], bl[2]);
+ ar[2] = vsubq_s32(br[1], br[2]);
+ al[3] = vsubq_s32(bl[0], bl[3]);
+ ar[3] = vsubq_s32(br[0], br[3]);
+
+ al[4] = bl[4];
+ ar[4] = br[4];
+
+ butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+ &ar[6], &al[5], &ar[5]);
+
+ al[7] = bl[7];
+ ar[7] = br[7];
+
+ al[8] = vaddq_s32(bl[8], bl[11]);
+ ar[8] = vaddq_s32(br[8], br[11]);
+ al[9] = vaddq_s32(bl[9], bl[10]);
+ ar[9] = vaddq_s32(br[9], br[10]);
+ al[10] = vsubq_s32(bl[9], bl[10]);
+ ar[10] = vsubq_s32(br[9], br[10]);
+ al[11] = vsubq_s32(bl[8], bl[11]);
+ ar[11] = vsubq_s32(br[8], br[11]);
+ al[12] = vsubq_s32(bl[15], bl[12]);
+ ar[12] = vsubq_s32(br[15], br[12]);
+ al[13] = vsubq_s32(bl[14], bl[13]);
+ ar[13] = vsubq_s32(br[14], br[13]);
+ al[14] = vaddq_s32(bl[14], bl[13]);
+ ar[14] = vaddq_s32(br[14], br[13]);
+ al[15] = vaddq_s32(bl[15], bl[12]);
+ ar[15] = vaddq_s32(br[15], br[12]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[17] = bl[17];
+ ar[17] = br[17];
+
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64,
+ cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]);
+ butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64,
+ cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]);
+ butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64,
+ -cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64,
+ -cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]);
+
+ al[22] = bl[22];
+ ar[22] = br[22];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[25] = bl[25];
+ ar[25] = br[25];
+
+ al[30] = bl[30];
+ ar[30] = br[30];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+ &br[0], &bl[1], &br[1]);
+ butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64,
+ &bl[2], &br[2], &bl[3], &br[3]);
+
+ bl[4] = vaddq_s32(al[4], al[5]);
+ br[4] = vaddq_s32(ar[4], ar[5]);
+ bl[5] = vsubq_s32(al[4], al[5]);
+ br[5] = vsubq_s32(ar[4], ar[5]);
+ bl[6] = vsubq_s32(al[7], al[6]);
+ br[6] = vsubq_s32(ar[7], ar[6]);
+ bl[7] = vaddq_s32(al[7], al[6]);
+ br[7] = vaddq_s32(ar[7], ar[6]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+
+ butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64,
+ &bl[14], &br[14], &bl[9], &br[9]);
+ butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64,
+ -cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]);
+
+ bl[11] = al[11];
+ br[11] = ar[11];
+ bl[12] = al[12];
+ br[12] = ar[12];
+
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[19], al[16]);
+ br[16] = vaddq_s32(ar[19], ar[16]);
+ bl[17] = vaddq_s32(al[18], al[17]);
+ br[17] = vaddq_s32(ar[18], ar[17]);
+ bl[18] = vsubq_s32(al[17], al[18]);
+ br[18] = vsubq_s32(ar[17], ar[18]);
+ bl[19] = vsubq_s32(al[16], al[19]);
+ br[19] = vsubq_s32(ar[16], ar[19]);
+ bl[20] = vsubq_s32(al[23], al[20]);
+ br[20] = vsubq_s32(ar[23], ar[20]);
+ bl[21] = vsubq_s32(al[22], al[21]);
+ br[21] = vsubq_s32(ar[22], ar[21]);
+ bl[22] = vaddq_s32(al[21], al[22]);
+ br[22] = vaddq_s32(ar[21], ar[22]);
+ bl[23] = vaddq_s32(al[20], al[23]);
+ br[23] = vaddq_s32(ar[20], ar[23]);
+ bl[24] = vaddq_s32(al[27], al[24]);
+ br[24] = vaddq_s32(ar[27], ar[24]);
+ bl[25] = vaddq_s32(al[26], al[25]);
+ br[25] = vaddq_s32(ar[26], ar[25]);
+ bl[26] = vsubq_s32(al[25], al[26]);
+ br[26] = vsubq_s32(ar[25], ar[26]);
+ bl[27] = vsubq_s32(al[24], al[27]);
+ br[27] = vsubq_s32(ar[24], ar[27]);
+ bl[28] = vsubq_s32(al[31], al[28]);
+ br[28] = vsubq_s32(ar[31], ar[28]);
+ bl[29] = vsubq_s32(al[30], al[29]);
+ br[29] = vsubq_s32(ar[30], ar[29]);
+ bl[30] = vaddq_s32(al[29], al[30]);
+ br[30] = vaddq_s32(ar[29], ar[30]);
+ bl[31] = vaddq_s32(al[28], al[31]);
+ br[31] = vaddq_s32(ar[28], ar[31]);
+
+ // Stage 6.
+ al[0] = bl[0];
+ ar[0] = br[0];
+ al[1] = bl[1];
+ ar[1] = br[1];
+ al[2] = bl[2];
+ ar[2] = br[2];
+ al[3] = bl[3];
+ ar[3] = br[3];
+
+ butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64,
+ &al[4], &ar[4], &al[7], &ar[7]);
+ butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64,
+ &al[5], &ar[5], &al[6], &ar[6]);
+
+ al[8] = vaddq_s32(bl[8], bl[9]);
+ ar[8] = vaddq_s32(br[8], br[9]);
+ al[9] = vsubq_s32(bl[8], bl[9]);
+ ar[9] = vsubq_s32(br[8], br[9]);
+ al[10] = vsubq_s32(bl[11], bl[10]);
+ ar[10] = vsubq_s32(br[11], br[10]);
+ al[11] = vaddq_s32(bl[11], bl[10]);
+ ar[11] = vaddq_s32(br[11], br[10]);
+ al[12] = vaddq_s32(bl[12], bl[13]);
+ ar[12] = vaddq_s32(br[12], br[13]);
+ al[13] = vsubq_s32(bl[12], bl[13]);
+ ar[13] = vsubq_s32(br[12], br[13]);
+ al[14] = vsubq_s32(bl[15], bl[14]);
+ ar[14] = vsubq_s32(br[15], br[14]);
+ al[15] = vaddq_s32(bl[15], bl[14]);
+ ar[15] = vaddq_s32(br[15], br[14]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[19] = bl[19];
+ ar[19] = br[19];
+ al[20] = bl[20];
+ ar[20] = br[20];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[27] = bl[27];
+ ar[27] = br[27];
+ al[28] = bl[28];
+ ar[28] = br[28];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64,
+ cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]);
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64,
+ -cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]);
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64,
+ cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64,
+ -cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]);
+
+ // Stage 7.
+ bl[0] = al[0];
+ br[0] = ar[0];
+ bl[1] = al[1];
+ br[1] = ar[1];
+ bl[2] = al[2];
+ br[2] = ar[2];
+ bl[3] = al[3];
+ br[3] = ar[3];
+ bl[4] = al[4];
+ br[4] = ar[4];
+ bl[5] = al[5];
+ br[5] = ar[5];
+ bl[6] = al[6];
+ br[6] = ar[6];
+ bl[7] = al[7];
+ br[7] = ar[7];
+
+ butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64,
+ &bl[8], &br[8], &bl[15], &br[15]);
+ butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64,
+ cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]);
+ butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64,
+ cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]);
+ butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64,
+ cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]);
+
+ bl[16] = vaddq_s32(al[16], al[17]);
+ br[16] = vaddq_s32(ar[16], ar[17]);
+ bl[17] = vsubq_s32(al[16], al[17]);
+ br[17] = vsubq_s32(ar[16], ar[17]);
+ bl[18] = vsubq_s32(al[19], al[18]);
+ br[18] = vsubq_s32(ar[19], ar[18]);
+ bl[19] = vaddq_s32(al[19], al[18]);
+ br[19] = vaddq_s32(ar[19], ar[18]);
+ bl[20] = vaddq_s32(al[20], al[21]);
+ br[20] = vaddq_s32(ar[20], ar[21]);
+ bl[21] = vsubq_s32(al[20], al[21]);
+ br[21] = vsubq_s32(ar[20], ar[21]);
+ bl[22] = vsubq_s32(al[23], al[22]);
+ br[22] = vsubq_s32(ar[23], ar[22]);
+ bl[23] = vaddq_s32(al[23], al[22]);
+ br[23] = vaddq_s32(ar[23], ar[22]);
+ bl[24] = vaddq_s32(al[24], al[25]);
+ br[24] = vaddq_s32(ar[24], ar[25]);
+ bl[25] = vsubq_s32(al[24], al[25]);
+ br[25] = vsubq_s32(ar[24], ar[25]);
+ bl[26] = vsubq_s32(al[27], al[26]);
+ br[26] = vsubq_s32(ar[27], ar[26]);
+ bl[27] = vaddq_s32(al[27], al[26]);
+ br[27] = vaddq_s32(ar[27], ar[26]);
+ bl[28] = vaddq_s32(al[28], al[29]);
+ br[28] = vaddq_s32(ar[28], ar[29]);
+ bl[29] = vsubq_s32(al[28], al[29]);
+ br[29] = vsubq_s32(ar[28], ar[29]);
+ bl[30] = vsubq_s32(al[31], al[30]);
+ br[30] = vsubq_s32(ar[31], ar[30]);
+ bl[31] = vaddq_s32(al[31], al[30]);
+ br[31] = vaddq_s32(ar[31], ar[30]);
+
+ // Final stage.
+ // Even-indexed outputs come straight from b[0..15]; odd-indexed outputs
+ // are produced by the final butterflies over b[16..31] below.
+ left[0] = bl[0];
+ right[0] = br[0];
+ left[16] = bl[1];
+ right[16] = br[1];
+ left[8] = bl[2];
+ right[8] = br[2];
+ left[24] = bl[3];
+ right[24] = br[3];
+ left[4] = bl[4];
+ right[4] = br[4];
+ left[20] = bl[5];
+ right[20] = br[5];
+ left[12] = bl[6];
+ right[12] = br[6];
+ left[28] = bl[7];
+ right[28] = br[7];
+ left[2] = bl[8];
+ right[2] = br[8];
+ left[18] = bl[9];
+ right[18] = br[9];
+ left[10] = bl[10];
+ right[10] = br[10];
+ left[26] = bl[11];
+ right[26] = br[11];
+ left[6] = bl[12];
+ right[6] = br[12];
+ left[22] = bl[13];
+ right[22] = br[13];
+ left[14] = bl[14];
+ right[14] = br[14];
+ left[30] = bl[15];
+ right[30] = br[15];
+
+ butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64,
+ cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]);
+ left[1] = al[1];
+ right[1] = ar[1];
+ left[31] = al[31];
+ right[31] = ar[31];
+
+ butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64,
+ cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]);
+ left[17] = al[17];
+ right[17] = ar[17];
+ left[15] = al[15];
+ right[15] = ar[15];
+
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64,
+ cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]);
+ left[9] = al[9];
+ right[9] = ar[9];
+ left[23] = al[23];
+ right[23] = ar[23];
+
+ butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64,
+ cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]);
+ left[25] = al[25];
+ right[25] = ar[25];
+ left[7] = al[7];
+ right[7] = ar[7];
+
+ butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64,
+ cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]);
+ left[5] = al[5];
+ right[5] = ar[5];
+ left[27] = al[27];
+ right[27] = ar[27];
+
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64,
+ cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]);
+ left[21] = al[21];
+ right[21] = ar[21];
+ left[11] = al[11];
+ right[11] = ar[11];
+
+ butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64,
+ cospi_19_64, &al[13], &ar[13], &al[19], &ar[19]);
+ left[13] = al[13];
+ right[13] = ar[13];
+ left[19] = al[19];
+ right[19] = ar[19];
+
+ butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64,
+ cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]);
+ left[29] = al[29];
+ right[29] = ar[29];
+ left[3] = al[3];
+ right[3] = ar[3];
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c
new file mode 100644
index 0000000000..3b9196fae9
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
+
+// 4x4 forward DCT: scale input by 16, run two 1-D passes (each ends with a
+// transpose), then apply the final +1 >> 2 scaling and store.
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+                      int stride) {
+  int16x4_t rows[4];
+  int i;
+
+  // input[M * stride] * 16
+  for (i = 0; i < 4; ++i) {
+    rows[i] = vshl_n_s16(vld1_s16(input + i * stride), 4);
+  }
+
+  // If the very first value != 0, then add 1 (to lane 0 only).
+  if (input[0] != 0) {
+    const int16x4_t lane0_one = vreinterpret_s16_s64(vdup_n_s64(1));
+    rows[0] = vadd_s16(rows[0], lane0_one);
+  }
+
+  vpx_fdct4x4_pass1_neon(rows);
+  vpx_fdct4x4_pass2_neon(rows);
+
+  {
+    // Not quite a rounding shift. Only add 1 despite shifting by 2.
+    const int16x8_t rounding = vdupq_n_s16(1);
+    int16x8_t rows_01 = vcombine_s16(rows[0], rows[1]);
+    int16x8_t rows_23 = vcombine_s16(rows[2], rows[3]);
+    rows_01 = vshrq_n_s16(vaddq_s16(rows_01, rounding), 2);
+    rows_23 = vshrq_n_s16(vaddq_s16(rows_23, rounding), 2);
+    store_s16q_to_tran_low(final_output + 0 * 8, rows_01);
+    store_s16q_to_tran_low(final_output + 1 * 8, rows_23);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// High-bitdepth 4x4 forward DCT: same flow as vpx_fdct4x4_neon but all
+// intermediates are kept in 32 bits, so no precision-specific second pass
+// is required.
+void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ // {1, 0, 0, 0}: adds 1 to lane 0 only.
+ static const int32x4_t const_1000 = { 1, 0, 0, 0 };
+ const int32x4_t const_one = vdupq_n_s32(1);
+
+ // input[M * stride] * 16
+ int32x4_t in[4];
+ in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4);
+ in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4);
+ in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4);
+ in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4);
+
+ // If the very first value != 0, then add 1.
+ if (input[0] != 0) {
+ in[0] = vaddq_s32(in[0], const_1000);
+ }
+
+ // NOTE(review): pass1 is deliberately invoked for both passes — the s32
+ // helper includes the transpose and loses no precision, so the two passes
+ // are identical here; confirm against the C reference if in doubt.
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ {
+ // Not quite a rounding shift. Only add 1 despite shifting by 2.
+ in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2);
+ in[1] = vshrq_n_s32(vaddq_s32(in[1], const_one), 2);
+ in[2] = vshrq_n_s32(vaddq_s32(in[2], const_one), 2);
+ in[3] = vshrq_n_s32(vaddq_s32(in[3], const_one), 2);
+
+ vst1q_s32(final_output, in[0]);
+ vst1q_s32(final_output + 4, in[1]);
+ vst1q_s32(final_output + 8, in[2]);
+ vst1q_s32(final_output + 12, in[3]);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h
new file mode 100644
index 0000000000..de3db9774c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+
+#include <arm_neon.h>
+
+// First 1-D pass of the 4x4 forward DCT over four packed rows, finishing
+// with a 4x4 transpose so the second pass can reuse the same row layout.
+// Uses the all-s16 "fast" butterfly for the cospi_16_64 rotation.
+static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
+ int16x4_t out[4];
+
+ // Pack rows 0/1 and 3/2 so the adds/subs below form all four sums at once.
+ const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+ const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
+ butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+ transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
+// Second 1-D pass of the 4x4 forward DCT.  Identical structure to pass 1,
+// except the cospi_16_64 rotation uses the widening s16->s32 helper (with
+// narrowing on output) to preserve precision for the final rounding.
+static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) {
+ int16x4_t out[4];
+
+ // Pack rows 0/1 and 3/2 so the adds/subs below form all four sums at once.
+ const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+ const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
+ butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0],
+ &out[2]);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+ transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// One 1-D pass of the high-bitdepth 4x4 forward DCT (used for both passes
+// by the caller).  All arithmetic is 32-bit; ends with a 4x4 transpose.
+static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
+ int32x4_t out[4];
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int32x4_t s_0 = vaddq_s32(in[0], in[3]);
+ const int32x4_t s_1 = vaddq_s32(in[1], in[2]);
+ const int32x4_t s_2 = vsubq_s32(in[1], in[2]);
+ const int32x4_t s_3 = vsubq_s32(in[0], in[3]);
+
+ // out[0]/out[2] = fdct_round_shift((s_0 +/- s_1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+ // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64,
+ &out[1], &out[3]);
+
+ transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c
new file mode 100644
index 0000000000..75ee6f2230
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
+
+// 8x8 forward DCT: scale input by 4, run two 1-D passes, then halve each
+// result with round-toward-zero semantics and store.
+void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+                      int stride) {
+  int16x8_t rows[8];
+  int i;
+
+  // stage 1: load each row and pre-scale by 4.
+  for (i = 0; i < 8; ++i) {
+    rows[i] = vshlq_n_s16(vld1q_s16(&input[i * stride]), 2);
+  }
+
+  vpx_fdct8x8_pass1_neon(rows);
+  vpx_fdct8x8_pass2_neon(rows);
+
+  // from vpx_dct_sse2.c
+  // Post-condition (division by two)
+  // division of two 16 bits signed numbers using shifts
+  // n / 2 = (n - (n >> 15)) >> 1
+  for (i = 0; i < 8; ++i) {
+    const int16x8_t sign = vshrq_n_s16(rows[i], 15);
+    rows[i] = vhsubq_s16(rows[i], sign);
+    // store results
+    store_s16q_to_tran_low(final_output + i * 8, rows[i]);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// High-bitdepth 8x8 forward DCT.  Each row is widened to 32 bits and split
+// into two int32x4 halves (left = columns 0-3, right = columns 4-7) before
+// the two 1-D passes; results are rounded/halved and stored interleaved.
+void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+                             int stride) {
+  int32x4_t left[8], right[8];
+  int i;
+
+  // Load each row, widen and pre-scale by 4 (input[M * stride] * 4).
+  for (i = 0; i < 8; ++i) {
+    const int16x8_t row = vld1q_s16(input + i * stride);
+    left[i] = vshll_n_s16(vget_low_s16(row), 2);
+    right[i] = vshll_n_s16(vget_high_s16(row), 2);
+  }
+
+  vpx_highbd_fdct8x8_pass1_neon(left, right);
+  vpx_highbd_fdct8x8_pass2_neon(left, right);
+
+  // Round/halve and store: low half of row i at i*8, high half at i*8+4.
+  for (i = 0; i < 8; ++i) {
+    left[i] = add_round_shift_half_s32(left[i]);
+    right[i] = add_round_shift_half_s32(right[i]);
+    vst1q_s32(final_output + i * 8, left[i]);
+    vst1q_s32(final_output + i * 8 + 4, right[i]);
+  }
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h
new file mode 100644
index 0000000000..cc65157430
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
+ int16x8_t *out) {
+ int16x8_t s[8], x[4], t[2];
+
+ s[0] = vaddq_s16(in[0], in[7]);
+ s[1] = vaddq_s16(in[1], in[6]);
+ s[2] = vaddq_s16(in[2], in[5]);
+ s[3] = vaddq_s16(in[3], in[4]);
+ s[4] = vsubq_s16(in[3], in[4]);
+ s[5] = vsubq_s16(in[2], in[5]);
+ s[6] = vsubq_s16(in[1], in[6]);
+ s[7] = vsubq_s16(in[0], in[7]);
+ // fdct4(step, step);
+ x[0] = vaddq_s16(s[0], s[3]);
+ x[1] = vaddq_s16(s[1], s[2]);
+ x[2] = vsubq_s16(s[1], s[2]);
+ x[3] = vsubq_s16(s[0], s[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]);
+
+ // Stage 3
+ x[0] = vaddq_s16(s[4], t[0]);
+ x[1] = vsubq_s16(s[4], t[0]);
+ x[2] = vsubq_s16(s[7], t[1]);
+ x[3] = vaddq_s16(s[7], t[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in,
+ int16x8_t *out) {
+ int16x8_t s[8], x[4], t[2];
+
+ s[0] = vaddq_s16(in[0], in[7]);
+ s[1] = vaddq_s16(in[1], in[6]);
+ s[2] = vaddq_s16(in[2], in[5]);
+ s[3] = vaddq_s16(in[3], in[4]);
+ s[4] = vsubq_s16(in[3], in[4]);
+ s[5] = vsubq_s16(in[2], in[5]);
+ s[6] = vsubq_s16(in[1], in[6]);
+ s[7] = vsubq_s16(in[0], in[7]);
+ // fdct4(step, step);
+ x[0] = vaddq_s16(s[0], s[3]);
+ x[1] = vaddq_s16(s[1], s[2]);
+ x[2] = vsubq_s16(s[1], s[2]);
+ x[3] = vsubq_s16(s[0], s[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+ &out[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1],
+ &t[0]);
+
+ // Stage 3
+ x[0] = vaddq_s16(s[4], t[0]);
+ x[1] = vsubq_s16(s[4], t[0]);
+ x[2] = vsubq_s16(s[7], t[1]);
+ x[3] = vaddq_s16(s[7], t[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
+ int16x8_t out[8];
+ vpx_fdct8x8_pass1_notranspose_neon(in, out);
+ // transpose 8x8
+ transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+ in[4] = out[4];
+ in[5] = out[5];
+ in[6] = out[6];
+ in[7] = out[7];
+}
+
+static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) {
+ int16x8_t out[8];
+ vpx_fdct8x8_pass2_notranspose_neon(in, out);
+ // transpose 8x8
+ transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+ in[4] = out[4];
+ in[5] = out[5];
+ in[6] = out[6];
+ in[7] = out[7];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // fdct4(step, step);
+ // x0 = s0 + s3;
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ // x1 = s1 + s2;
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ // x2 = s1 - s2;
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ // x3 = s0 - s3;
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[4], &right[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64,
+ &left[2], &right[2], &left[6], &right[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+ &tr[1], &tl[0], &tr[0]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], tl[0]);
+ xr[0] = vaddq_s32(sr[4], tr[0]);
+ xl[1] = vsubq_s32(sl[4], tl[0]);
+ xr[1] = vsubq_s32(sr[4], tr[0]);
+ xl[2] = vsubq_s32(sl[7], tl[1]);
+ xr[2] = vsubq_s32(sr[7], tr[1]);
+ xl[3] = vaddq_s32(sl[7], tl[1]);
+ xr[3] = vaddq_s32(sr[7], tr[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64,
+ &left[1], &right[1], &left[7], &right[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64,
+ &left[5], &right[5], &left[3], &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // fdct4(step, step);
+ // x0 = s0 + s3;
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ // x1 = s1 + s2;
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ // x2 = s1 - s2;
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ // x3 = s0 - s3;
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[4], &right[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+ cospi_24_64, &left[2], &right[2], &left[6],
+ &right[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+ &tr[1], &tl[0], &tr[0]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], tl[0]);
+ xr[0] = vaddq_s32(sr[4], tr[0]);
+ xl[1] = vsubq_s32(sl[4], tl[0]);
+ xr[1] = vsubq_s32(sr[4], tr[0]);
+ xl[2] = vsubq_s32(sl[7], tl[1]);
+ xr[2] = vsubq_s32(sr[7], tr[1]);
+ xl[3] = vaddq_s32(sl[7], tl[1]);
+ xr[3] = vaddq_s32(sr[7], tr[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+ cospi_28_64, &left[1], &right[1], &left[7],
+ &right[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+ cospi_12_64, &left[5], &right[5], &left[3],
+ &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
+ int32x4_t *right) {
+ vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
+ transpose_s32_8x8_2(left, right, left, right);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left,
+ int32x4_t *right) {
+ vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right);
+ transpose_s32_8x8_2(left, right, left, right);
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h
new file mode 100644
index 0000000000..16f5c5fc0e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h
@@ -0,0 +1,542 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT_NEON_H_
+
+#include <arm_neon.h>
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulh_s16 operation on half vector
+// can be slightly less accurate, adequate for pass1
+static INLINE void butterfly_one_coeff_s16_fast_half(const int16x4_t a,
+ const int16x4_t b,
+ const tran_coef_t constant,
+ int16x4_t *add,
+ int16x4_t *sub) {
+ int16x4_t c = vdup_n_s16(2 * constant);
+ *add = vqrdmulh_s16(vadd_s16(a, b), c);
+ *sub = vqrdmulh_s16(vsub_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulh_s16 operation on full vector
+// can be slightly less accurate, adequate for pass1
+static INLINE void butterfly_one_coeff_s16_fast(const int16x8_t a,
+ const int16x8_t b,
+ const tran_coef_t constant,
+ int16x8_t *add,
+ int16x8_t *sub) {
+ int16x8_t c = vdupq_n_s16(2 * constant);
+ *add = vqrdmulhq_s16(vaddq_s16(a, b), c);
+ *sub = vqrdmulhq_s16(vsubq_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values, high/low
+static INLINE void butterfly_one_coeff_s16_s32_fast(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+ int32x4_t *sub_hi) {
+ int32x4_t c = vdupq_n_s32(constant << 17);
+ const int16x4_t a_lo = vget_low_s16(a);
+ const int16x4_t a_hi = vget_high_s16(a);
+ const int16x4_t b_lo = vget_low_s16(b);
+ const int16x4_t b_hi = vget_high_s16(b);
+ *add_lo = vqrdmulhq_s32(vaddl_s16(a_lo, b_lo), c);
+ *add_hi = vqrdmulhq_s32(vaddl_s16(a_hi, b_hi), c);
+ *sub_lo = vqrdmulhq_s32(vsubl_s16(a_lo, b_lo), c);
+ *sub_hi = vqrdmulhq_s32(vsubl_s16(a_hi, b_hi), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int16x8_t *add, int16x8_t *sub) {
+ int32x4_t add_lo, add_hi, sub_lo, sub_hi;
+ butterfly_one_coeff_s16_s32_fast(a, b, constant, &add_lo, &add_hi, &sub_lo,
+ &sub_hi);
+ *add = vcombine_s16(vmovn_s32(add_lo), vmovn_s32(add_hi));
+ *sub = vcombine_s16(vmovn_s32(sub_lo), vmovn_s32(sub_hi));
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_half(
+ const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+ int32x4_t *add, int32x4_t *sub) {
+ int32x4_t c = vdupq_n_s32(constant << 17);
+ *add = vqrdmulhq_s32(vaddl_s16(a, b), c);
+ *sub = vqrdmulhq_s32(vsubl_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow_half(
+ const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+ int16x4_t *add, int16x4_t *sub) {
+ int32x4_t add32, sub32;
+ butterfly_one_coeff_s16_s32_fast_half(a, b, constant, &add32, &sub32);
+ *add = vmovn_s32(add32);
+ *sub = vmovn_s32(sub32);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original Variant that performs normal implementation on full vector
+// fully accurate does 32-bit processing, takes 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+ int32x4_t *sub_hi) {
+ const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
+ const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
+ const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
+ const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
+ *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+ *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+ *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+ *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original Variant that performs normal implementation on full vector
+// fully accurate does 32-bit processing, takes 16-bit values
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_narrow(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int16x8_t *add, int16x8_t *sub) {
+ int32x4_t add32_lo, add32_hi, sub32_lo, sub32_hi;
+ butterfly_one_coeff_s16_s32(a, b, constant, &add32_lo, &add32_hi, &sub32_lo,
+ &sub32_hi);
+ *add = vcombine_s16(vmovn_s32(add32_lo), vmovn_s32(add32_hi));
+ *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi));
+}
+
+// (a +/- b) * c
+// Variant that performs normal vmul/vmla implementation on full vector
+// does 32-bit processing, takes and returns 32-bit values, high/low,
+// without rounding or shifting
+static INLINE void butterfly_one_coeff_s32_noround(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+ int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmulq_n_s32(a_lo, constant);
+ const int32x4_t a2 = vmulq_n_s32(a_hi, constant);
+ const int32x4_t a3 = vmulq_n_s32(a_lo, constant);
+ const int32x4_t a4 = vmulq_n_s32(a_hi, constant);
+ *add_lo = vmlaq_n_s32(a1, b_lo, constant);
+ *add_hi = vmlaq_n_s32(a2, b_hi, constant);
+ *sub_lo = vmlsq_n_s32(a3, b_lo, constant);
+ *sub_hi = vmlsq_n_s32(a4, b_hi, constant);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes and returns a single
+// 32-bit vector for each of add/sub
+static INLINE void butterfly_one_coeff_s32_fast_half(const int32x4_t a,
+ const int32x4_t b,
+ const tran_coef_t constant,
+ int32x4_t *add,
+ int32x4_t *sub) {
+ const int32x4_t c = vdupq_n_s32(constant << 17);
+ *add = vqrdmulhq_s32(vaddq_s32(a, b), c);
+ *sub = vqrdmulhq_s32(vsubq_s32(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values,
+// high/low
+static INLINE void butterfly_one_coeff_s32_fast(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+ int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t c = vdupq_n_s32(constant << 17);
+ *add_lo = vqrdmulhq_s32(vaddq_s32(a_lo, b_lo), c);
+ *add_hi = vqrdmulhq_s32(vaddq_s32(a_hi, b_hi), c);
+ *sub_lo = vqrdmulhq_s32(vsubq_s32(a_lo, b_lo), c);
+ *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs normal implementation on full vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_one_coeff_s32_s64_narrow(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+ int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ // ac holds the following values:
+ // ac: vget_low_s32(a_lo) * c, vget_high_s32(a_lo) * c,
+ // vget_low_s32(a_hi) * c, vget_high_s32(a_hi) * c
+ int64x2_t ac[4];
+ int64x2_t sum[4];
+ int64x2_t diff[4];
+
+ ac[0] = vmull_n_s32(vget_low_s32(a_lo), constant);
+ ac[1] = vmull_n_s32(vget_high_s32(a_lo), constant);
+ ac[2] = vmull_n_s32(vget_low_s32(a_hi), constant);
+ ac[3] = vmull_n_s32(vget_high_s32(a_hi), constant);
+
+ sum[0] = vmlal_n_s32(ac[0], vget_low_s32(b_lo), constant);
+ sum[1] = vmlal_n_s32(ac[1], vget_high_s32(b_lo), constant);
+ sum[2] = vmlal_n_s32(ac[2], vget_low_s32(b_hi), constant);
+ sum[3] = vmlal_n_s32(ac[3], vget_high_s32(b_hi), constant);
+ *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[1], DCT_CONST_BITS));
+ *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[3], DCT_CONST_BITS));
+
+ diff[0] = vmlsl_n_s32(ac[0], vget_low_s32(b_lo), constant);
+ diff[1] = vmlsl_n_s32(ac[1], vget_high_s32(b_lo), constant);
+ diff[2] = vmlsl_n_s32(ac[2], vget_low_s32(b_hi), constant);
+ diff[3] = vmlsl_n_s32(ac[3], vget_high_s32(b_hi), constant);
+ *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[1], DCT_CONST_BITS));
+ *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[3], DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on half vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_s64_narrow_half(
+ const int32x4_t a, const int32x4_t b, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add, int32x4_t *sub) {
+ const int32x2_t a_lo = vget_low_s32(a);
+ const int32x2_t a_hi = vget_high_s32(a);
+ const int32x2_t b_lo = vget_low_s32(b);
+ const int32x2_t b_hi = vget_high_s32(b);
+
+ const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, constant1);
+ const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, constant1);
+ const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, constant2);
+ const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, constant2);
+
+ const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, constant2);
+ const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, constant2);
+ const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, constant1);
+ const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, constant1);
+
+ *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
+ vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
+ *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
+ vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
+}
+
+// a * c1 +/- b * c2
+// Variant that performs normal implementation on full vector
+// more accurate does 64-bit processing, takes 32-bit values and
+// returns 64-bit results without rounding
+static INLINE void butterfly_two_coeff_s32_s64_noround(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int64x2_t *add_lo /*[2]*/,
+ int64x2_t *add_hi /*[2]*/, int64x2_t *sub_lo /*[2]*/,
+ int64x2_t *sub_hi /*[2]*/) {
+ // ac1/ac2 hold the following values:
+ // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1,
+ // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1
+ // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2,
+ // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2
+ int64x2_t ac1[4];
+ int64x2_t ac2[4];
+
+ ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1);
+ ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1);
+ ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1);
+ ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1);
+ ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2);
+ ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2);
+ ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2);
+ ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2);
+
+ add_lo[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2);
+ add_lo[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2);
+ add_hi[0] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2);
+ add_hi[1] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2);
+
+ sub_lo[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1);
+ sub_lo[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1);
+ sub_hi[0] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1);
+ sub_hi[1] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on full vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_s64_narrow(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ // ac1/ac2 hold the following values:
+ // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1,
+ // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1
+ // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2,
+ // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2
+ int64x2_t ac1[4];
+ int64x2_t ac2[4];
+ int64x2_t sum[4];
+ int64x2_t diff[4];
+
+ ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1);
+ ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1);
+ ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1);
+ ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1);
+ ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2);
+ ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2);
+ ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2);
+ ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2);
+
+ sum[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2);
+ sum[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2);
+ sum[2] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2);
+ sum[3] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2);
+ *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[1], DCT_CONST_BITS));
+ *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[3], DCT_CONST_BITS));
+
+ diff[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1);
+ diff[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1);
+ diff[2] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1);
+ diff[3] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1);
+ *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[1], DCT_CONST_BITS));
+ *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[3], DCT_CONST_BITS));
+}
+
+// a * c1 +/- b * c2
+// Original Variant that performs normal implementation on full vector
+// does 32-bit processing, takes 16-bit input values and returns
+// 32-bit results without rounding or shifting
+static INLINE void butterfly_two_coeff_s16_s32_noround(
+ const int16x4_t a_lo, const int16x4_t a_hi, const int16x4_t b_lo,
+ const int16x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmull_n_s16(a_lo, constant1);
+ const int32x4_t a2 = vmull_n_s16(a_hi, constant1);
+ const int32x4_t a3 = vmull_n_s16(a_lo, constant2);
+ const int32x4_t a4 = vmull_n_s16(a_hi, constant2);
+ *add_lo = vmlal_n_s16(a1, b_lo, constant2);
+ *add_hi = vmlal_n_s16(a2, b_hi, constant2);
+ *sub_lo = vmlsl_n_s16(a3, b_lo, constant1);
+ *sub_hi = vmlsl_n_s16(a4, b_hi, constant1);
+}
+
+// a * c1 +/- b * c2
+// Original Variant that performs normal implementation on full vector
+// does 32-bit processing, takes and returns 32-bit values
+// without rounding or shifting
+static INLINE void butterfly_two_coeff_s32_noround(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+ const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+ const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+ const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+ *add_lo = vmlaq_n_s32(a1, b_lo, constant2);
+ *add_hi = vmlaq_n_s32(a2, b_hi, constant2);
+ *sub_lo = vmlsq_n_s32(a3, b_lo, constant1);
+ *sub_hi = vmlsq_n_s32(a4, b_hi, constant1);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on half vector
+// more accurate does 32-bit processing, takes and returns 16-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_half(const int16x4_t a,
+ const int16x4_t b,
+ const tran_coef_t constant1,
+ const tran_coef_t constant2,
+ int16x4_t *add, int16x4_t *sub) {
+ const int32x4_t a1 = vmull_n_s16(a, constant1);
+ const int32x4_t a2 = vmull_n_s16(a, constant2);
+ const int32x4_t sum = vmlal_n_s16(a1, b, constant2);
+ const int32x4_t diff = vmlsl_n_s16(a2, b, constant1);
+ *add = vqrshrn_n_s32(sum, DCT_CONST_BITS);
+ *sub = vqrshrn_n_s32(diff, DCT_CONST_BITS);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 16-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+ const tran_coef_t constant1,
+ const tran_coef_t constant2,
+ int16x8_t *add, int16x8_t *sub) {
+ const int32x4_t a1 = vmull_n_s16(vget_low_s16(a), constant1);
+ const int32x4_t a2 = vmull_n_s16(vget_high_s16(a), constant1);
+ const int32x4_t a3 = vmull_n_s16(vget_low_s16(a), constant2);
+ const int32x4_t a4 = vmull_n_s16(vget_high_s16(a), constant2);
+ const int32x4_t sum0 = vmlal_n_s16(a1, vget_low_s16(b), constant2);
+ const int32x4_t sum1 = vmlal_n_s16(a2, vget_high_s16(b), constant2);
+ const int32x4_t diff0 = vmlsl_n_s16(a3, vget_low_s16(b), constant1);
+ const int32x4_t diff1 = vmlsl_n_s16(a4, vget_high_s16(b), constant1);
+ const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+ const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+ const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+ const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+ *add = vcombine_s16(rounded0, rounded1);
+ *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values
+// returns rounded 32-bit results (no narrowing)
+static INLINE void butterfly_two_coeff_s32(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+ const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+ const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+ const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+ const int32x4_t sum0 = vmlaq_n_s32(a1, b_lo, constant2);
+ const int32x4_t sum1 = vmlaq_n_s32(a2, b_hi, constant2);
+ const int32x4_t diff0 = vmlsq_n_s32(a3, b_lo, constant1);
+ const int32x4_t diff1 = vmlsq_n_s32(a4, b_hi, constant1);
+ *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+ *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+ *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+ *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
+ const int16x8_t one = vdupq_n_s16(1);
+ const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+ const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+ const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+ return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding,
+// return narrowed 16-bit results
+static INLINE int16x8_t add_round_shift_s32_narrow(const int32x4_t a_lo,
+ const int32x4_t a_hi) {
+ const int32x4_t one = vdupq_n_s32(1);
+ const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
+ const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
+ const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
+ const int16x4_t b_lo =
+ vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
+ const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
+ const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
+ const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
+ const int16x4_t b_hi =
+ vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
+ return vcombine_s16(b_lo, b_hi);
+}
+
+// Add 1 if negative, then shift right by 1.
+// In practice, add the sign bit before the (non-rounding) shift.
+static INLINE int32x4_t add_round_shift_half_s32(const int32x4_t a) {
+  // Logical shift of the sign bit: 1 for negative lanes, 0 otherwise.
+  const int32x4_t sign =
+      vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
+  return vshrq_n_s32(vaddq_s32(a, sign), 1);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// Equivalently: add one plus the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_s32(const int32x4_t a) {
+  // Arithmetic shift gives 0 (non-negative) or -1 (negative); subtracting
+  // it is equivalent to adding the sign bit.
+  const int32x4_t neg_mask = vshrq_n_s32(a, 31);
+  const int32x4_t biased = vsubq_s32(vaddq_s32(a, vdupq_n_s32(1)), neg_mask);
+  return vshrq_n_s32(biased, 2);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then use a rounding shift.
+static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) {
+  // Adding the arithmetic-shift mask (0 or -1) subtracts the sign bit.
+  const int16x8_t neg_mask = vshrq_n_s16(a, 15);
+  return vrshrq_n_s16(vaddq_s16(a, neg_mask), 2);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then use a rounding shift.
+static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) {
+  // Adding the arithmetic-shift mask (0 or -1) subtracts the sign bit.
+  const int32x4_t neg_mask = vshrq_n_s32(a, 31);
+  return vrshrq_n_s32(vaddq_s32(a, neg_mask), 2);
+}
+
+// Lane-wise 64-bit addition of two double-vector operands followed by a
+// rounding shift by DCT_CONST_BITS and narrowing back to 32 bits.
+static INLINE int32x4_t add_s64_round_narrow(const int64x2_t *a /*[2]*/,
+                                             const int64x2_t *b /*[2]*/) {
+  const int64x2_t sum_lo = vaddq_s64(a[0], b[0]);
+  const int64x2_t sum_hi = vaddq_s64(a[1], b[1]);
+  return vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
+                      vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
+}
+
+// Lane-wise 64-bit subtraction of two double-vector operands followed by a
+// rounding shift by DCT_CONST_BITS and narrowing back to 32 bits.
+static INLINE int32x4_t sub_s64_round_narrow(const int64x2_t *a /*[2]*/,
+                                             const int64x2_t *b /*[2]*/) {
+  const int64x2_t diff_lo = vsubq_s64(a[0], b[0]);
+  const int64x2_t diff_hi = vsubq_s64(a[1], b[1]);
+  return vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
+                      vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
+}
+
+// Add two s32 vectors using 64-bit intermediates (widening add), then
+// truncate-narrow the sums back to 32 bits.
+static INLINE int32x4_t add_s32_s64_narrow(const int32x4_t a,
+                                           const int32x4_t b) {
+  const int64x2_t sum_lo = vaddl_s32(vget_low_s32(a), vget_low_s32(b));
+  const int64x2_t sum_hi = vaddl_s32(vget_high_s32(a), vget_high_s32(b));
+  return vcombine_s32(vmovn_s64(sum_lo), vmovn_s64(sum_hi));
+}
+
+// Subtract two s32 vectors using 64-bit intermediates (widening subtract),
+// then truncate-narrow the differences back to 32 bits.
+static INLINE int32x4_t sub_s32_s64_narrow(const int32x4_t a,
+                                           const int32x4_t b) {
+  const int64x2_t diff_lo = vsubl_s32(vget_low_s32(a), vget_low_s32(b));
+  const int64x2_t diff_hi = vsubl_s32(vget_high_s32(a), vget_high_s32(b));
+  return vcombine_s32(vmovn_s64(diff_lo), vmovn_s64(diff_hi));
+}
+
+#endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c
new file mode 100644
index 0000000000..718dba0d91
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+// 4x4 forward transform, DC coefficient only: sum all 16 residuals and
+// scale by 2.
+void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
+  const int16x4_t r0 = vld1_s16(input + 0 * stride);
+  const int16x4_t r1 = vld1_s16(input + 1 * stride);
+  const int16x4_t r2 = vld1_s16(input + 2 * stride);
+  const int16x4_t r3 = vld1_s16(input + 3 * stride);
+
+  const int16x8_t rows01 = vcombine_s16(r0, r1);
+  const int16x8_t rows23 = vcombine_s16(r2, r3);
+  const int16x8_t sum = vaddq_s16(rows01, rows23);
+
+  output[0] = (tran_low_t)(horizontal_add_int16x8(sum) << 1);
+  output[1] = 0;
+}
+
+// 8x8 forward transform, DC coefficient only: the plain sum of all 64
+// residuals (no scaling at this block size).
+void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
+  int16x8_t sum = vld1q_s16(input);
+  int r = 1;
+
+  do {
+    sum = vaddq_s16(sum, vld1q_s16(input + r * stride));
+  } while (++r < 8);
+
+  output[0] = (tran_low_t)horizontal_add_int16x8(sum);
+  output[1] = 0;
+}
+
+// 16x16 forward transform, DC coefficient only: sum of all residuals,
+// scaled by 1/2.
+void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  int16x8_t sum_lo = vld1q_s16(input);
+  int16x8_t sum_hi = vld1q_s16(input + 8);
+  int32_t total;
+  int r;
+
+  for (r = 1; r < 16; ++r) {
+    input += stride;
+    sum_lo = vaddq_s16(sum_lo, vld1q_s16(input));
+    sum_hi = vaddq_s16(sum_hi, vld1q_s16(input + 8));
+  }
+
+  total = horizontal_add_int16x8(sum_lo) + horizontal_add_int16x8(sum_hi);
+  output[0] = (tran_low_t)(total >> 1);
+  output[1] = 0;
+}
+
+// 32x32 forward transform, DC coefficient only: sum of all residuals,
+// scaled by 1/8.
+void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  int16x8_t sum0 = vld1q_s16(input + 0);
+  int16x8_t sum1 = vld1q_s16(input + 8);
+  int16x8_t sum2 = vld1q_s16(input + 16);
+  int16x8_t sum3 = vld1q_s16(input + 24);
+  int32_t total;
+  int r;
+
+  for (r = 1; r < 32; ++r) {
+    input += stride;
+    sum0 = vaddq_s16(sum0, vld1q_s16(input + 0));
+    sum1 = vaddq_s16(sum1, vld1q_s16(input + 8));
+    sum2 = vaddq_s16(sum2, vld1q_s16(input + 16));
+    sum3 = vaddq_s16(sum3, vld1q_s16(input + 24));
+  }
+
+  total = horizontal_add_int16x8(sum0) + horizontal_add_int16x8(sum1) +
+          horizontal_add_int16x8(sum2) + horizontal_add_int16x8(sum3);
+  output[0] = (tran_low_t)(total >> 3);
+  output[1] = 0;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// High-bitdepth 16x16 forward transform, DC coefficient only.
+// Residuals are accumulated in 32 bits so high-bitdepth sums cannot
+// overflow; result is scaled by 1/2.
+void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
+                                 int stride) {
+  int32x4_t acc[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+                       vdupq_n_s32(0) };
+  int r;
+
+  for (r = 0; r < 16; ++r) {
+    const int16x8_t lo = vld1q_s16(input);
+    const int16x8_t hi = vld1q_s16(input + 8);
+    input += stride;
+    acc[0] = vaddw_s16(acc[0], vget_low_s16(lo));
+    acc[1] = vaddw_s16(acc[1], vget_high_s16(lo));
+    acc[2] = vaddw_s16(acc[2], vget_low_s16(hi));
+    acc[3] = vaddw_s16(acc[3], vget_high_s16(hi));
+  }
+
+  acc[0] = vaddq_s32(vaddq_s32(acc[0], acc[1]), vaddq_s32(acc[2], acc[3]));
+  output[0] = (tran_low_t)(horizontal_add_int32x4(acc[0]) >> 1);
+  output[1] = 0;
+}
+
+// High-bitdepth 32x32 forward transform, DC coefficient only.
+// Residuals are accumulated in 32 bits; the result is scaled by 1/8.
+void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
+                                 int stride) {
+  int32x4_t acc[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+                       vdupq_n_s32(0) };
+  int r;
+
+  for (r = 0; r < 32; ++r) {
+    const int16x8_t c0 = vld1q_s16(input);
+    const int16x8_t c1 = vld1q_s16(input + 8);
+    const int16x8_t c2 = vld1q_s16(input + 16);
+    const int16x8_t c3 = vld1q_s16(input + 24);
+    input += stride;
+    acc[0] = vaddw_s16(acc[0], vget_low_s16(c0));
+    acc[0] = vaddw_s16(acc[0], vget_high_s16(c0));
+    acc[1] = vaddw_s16(acc[1], vget_low_s16(c1));
+    acc[1] = vaddw_s16(acc[1], vget_high_s16(c1));
+    acc[2] = vaddw_s16(acc[2], vget_low_s16(c2));
+    acc[2] = vaddw_s16(acc[2], vget_high_s16(c2));
+    acc[3] = vaddw_s16(acc[3], vget_low_s16(c3));
+    acc[3] = vaddw_s16(acc[3], vget_high_s16(c3));
+  }
+
+  acc[0] = vaddq_s32(vaddq_s32(acc[0], acc[1]), vaddq_s32(acc[2], acc[3]));
+  output[0] = (tran_low_t)(horizontal_add_int32x4(acc[0]) >> 3);
+  output[1] = 0;
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c
new file mode 100644
index 0000000000..f6b6d7e3ce
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+// One pass of the 8x8 Hadamard transform: three in-place butterfly stages
+// across the eight input vectors.
+// Note the final stage writes its results to the a* outputs in a permuted
+// order (see the assignments below); callers rely on that exact ordering.
+static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+                                 int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
+                                 int16x8_t *a6, int16x8_t *a7) {
+  // Stage 1: butterflies between adjacent input pairs.
+  const int16x8_t b0 = vaddq_s16(*a0, *a1);
+  const int16x8_t b1 = vsubq_s16(*a0, *a1);
+  const int16x8_t b2 = vaddq_s16(*a2, *a3);
+  const int16x8_t b3 = vsubq_s16(*a2, *a3);
+  const int16x8_t b4 = vaddq_s16(*a4, *a5);
+  const int16x8_t b5 = vsubq_s16(*a4, *a5);
+  const int16x8_t b6 = vaddq_s16(*a6, *a7);
+  const int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+  // Stage 2: butterflies between pairs of pairs.
+  const int16x8_t c0 = vaddq_s16(b0, b2);
+  const int16x8_t c1 = vaddq_s16(b1, b3);
+  const int16x8_t c2 = vsubq_s16(b0, b2);
+  const int16x8_t c3 = vsubq_s16(b1, b3);
+  const int16x8_t c4 = vaddq_s16(b4, b6);
+  const int16x8_t c5 = vaddq_s16(b5, b7);
+  const int16x8_t c6 = vsubq_s16(b4, b6);
+  const int16x8_t c7 = vsubq_s16(b5, b7);
+
+  // Stage 3: combine the two halves; destinations are deliberately permuted.
+  *a0 = vaddq_s16(c0, c4);
+  *a1 = vsubq_s16(c2, c6);
+  *a2 = vsubq_s16(c0, c4);
+  *a3 = vaddq_s16(c2, c6);
+  *a4 = vaddq_s16(c3, c7);
+  *a5 = vsubq_s16(c3, c7);
+  *a6 = vsubq_s16(c1, c5);
+  *a7 = vaddq_s16(c1, c5);
+}
+
+// 8x8 Hadamard transform of a residual block: row pass, transpose,
+// column pass.
+void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                           tran_low_t *coeff) {
+  int16x8_t r0 = vld1q_s16(src_diff + 0 * src_stride);
+  int16x8_t r1 = vld1q_s16(src_diff + 1 * src_stride);
+  int16x8_t r2 = vld1q_s16(src_diff + 2 * src_stride);
+  int16x8_t r3 = vld1q_s16(src_diff + 3 * src_stride);
+  int16x8_t r4 = vld1q_s16(src_diff + 4 * src_stride);
+  int16x8_t r5 = vld1q_s16(src_diff + 5 * src_stride);
+  int16x8_t r6 = vld1q_s16(src_diff + 6 * src_stride);
+  int16x8_t r7 = vld1q_s16(src_diff + 7 * src_stride);
+
+  hadamard8x8_one_pass(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+  transpose_s16_8x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+  hadamard8x8_one_pass(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+
+  // The second transpose is not required, so it is skipped.
+  store_s16q_to_tran_low(coeff + 0 * 8, r0);
+  store_s16q_to_tran_low(coeff + 1 * 8, r1);
+  store_s16q_to_tran_low(coeff + 2 * 8, r2);
+  store_s16q_to_tran_low(coeff + 3 * 8, r3);
+  store_s16q_to_tran_low(coeff + 4 * 8, r4);
+  store_s16q_to_tran_low(coeff + 5 * 8, r5);
+  store_s16q_to_tran_low(coeff + 6 * 8, r6);
+  store_s16q_to_tran_low(coeff + 7 * 8, r7);
+}
+
+// 16x16 Hadamard: four 8x8 transforms followed by a combining pass over
+// the quadrant outputs. The first combining stage uses halving adds so the
+// values stay within 16 bits.
+void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                             tran_low_t *coeff) {
+  int i;
+
+  // Rearrange 16x16 into four contiguous 64-coefficient buckets
+  // (top-left, top-right, bottom-left, bottom-right).
+  vpx_hadamard_8x8_neon(src_diff, src_stride, coeff);
+  vpx_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64);
+  vpx_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride, coeff + 128);
+  vpx_hadamard_8x8_neon(src_diff + 8 * src_stride + 8, src_stride,
+                        coeff + 192);
+
+  // Combine the four buckets, eight coefficients at a time.
+  for (i = 0; i < 64; i += 8) {
+    const int16x8_t a0 = load_tran_low_to_s16q(coeff + i);
+    const int16x8_t a1 = load_tran_low_to_s16q(coeff + i + 64);
+    const int16x8_t a2 = load_tran_low_to_s16q(coeff + i + 128);
+    const int16x8_t a3 = load_tran_low_to_s16q(coeff + i + 192);
+
+    const int16x8_t b0 = vhaddq_s16(a0, a1);
+    const int16x8_t b1 = vhsubq_s16(a0, a1);
+    const int16x8_t b2 = vhaddq_s16(a2, a3);
+    const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+    store_s16q_to_tran_low(coeff + i, vaddq_s16(b0, b2));
+    store_s16q_to_tran_low(coeff + i + 64, vaddq_s16(b1, b3));
+    store_s16q_to_tran_low(coeff + i + 128, vsubq_s16(b0, b2));
+    store_s16q_to_tran_low(coeff + i + 192, vsubq_s16(b1, b3));
+  }
+}
+
+// 32x32 Hadamard: four 16x16 transforms followed by a combining pass.
+// Both combining stages use halving operations to control dynamic range.
+void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                             tran_low_t *coeff) {
+  int i;
+
+  // Rearrange 32x32 into four contiguous 256-coefficient buckets
+  // (top-left, top-right, bottom-left, bottom-right).
+  vpx_hadamard_16x16_neon(src_diff, src_stride, coeff);
+  vpx_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256);
+  vpx_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride,
+                          coeff + 512);
+  vpx_hadamard_16x16_neon(src_diff + 16 * src_stride + 16, src_stride,
+                          coeff + 768);
+
+  // Combine the four buckets, eight coefficients at a time.
+  for (i = 0; i < 256; i += 8) {
+    const int16x8_t a0 = load_tran_low_to_s16q(coeff + i);
+    const int16x8_t a1 = load_tran_low_to_s16q(coeff + i + 256);
+    const int16x8_t a2 = load_tran_low_to_s16q(coeff + i + 512);
+    const int16x8_t a3 = load_tran_low_to_s16q(coeff + i + 768);
+
+    const int16x8_t b0 = vhaddq_s16(a0, a1);
+    const int16x8_t b1 = vhsubq_s16(a0, a1);
+    const int16x8_t b2 = vhaddq_s16(a2, a3);
+    const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+    store_s16q_to_tran_low(coeff + i, vhaddq_s16(b0, b2));
+    store_s16q_to_tran_low(coeff + i + 256, vhaddq_s16(b1, b3));
+    store_s16q_to_tran_low(coeff + i + 512, vhsubq_s16(b0, b2));
+    store_s16q_to_tran_low(coeff + i + 768, vhsubq_s16(b1, b3));
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c
new file mode 100644
index 0000000000..4265596c8c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+// Rounded average of a 4x4 block of 16-bit pixels.
+uint32_t vpx_highbd_avg_4x4_neon(const uint8_t *s8, int p) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(s8);
+  const uint16x8_t rows01 = load_unaligned_u16q(src, p);
+  const uint16x8_t rows23 = load_unaligned_u16q(src + 2 * p, p);
+  const uint32_t sum = horizontal_add_uint16x8(vaddq_u16(rows01, rows23));
+  return (sum + 8) >> 4;
+}
+
+// Rounded average of an 8x8 block of 16-bit pixels.
+uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *s8, int p) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(s8);
+  uint16x8_t r0, r1, r2, r3, r4, r5, r6, r7;
+  uint16x8_t sum01, sum23, sum45, sum67, sum;
+
+  load_u16_8x8(src, p, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+
+  // Pairwise tree reduction of the eight rows.
+  sum01 = vaddq_u16(r0, r1);
+  sum23 = vaddq_u16(r2, r3);
+  sum45 = vaddq_u16(r4, r5);
+  sum67 = vaddq_u16(r6, r7);
+  sum = vaddq_u16(vaddq_u16(sum01, sum23), vaddq_u16(sum45, sum67));
+
+  return (horizontal_add_uint16x8(sum) + 32) >> 6;
+}
+
+// Sum of absolute transform coefficients.
+// coeff: 32 bits, dynamic range [-2147483648, 2147483647].
+// length: value range {16, 64, 256, 1024}.
+// The accumulation is done in 64 bits, so the 42-bit worst case
+// (2147483647 * 1024) cannot overflow.
+int vpx_highbd_satd_neon(const tran_low_t *coeff, int length) {
+  int64x2_t acc0 = vdupq_n_s64(0);
+  int64x2_t acc1 = vdupq_n_s64(0);
+  int i;
+
+  for (i = 0; i < length; i += 8) {
+    const int32x4_t c0 = load_tran_low_to_s32q(coeff + i);
+    const int32x4_t c1 = load_tran_low_to_s32q(coeff + i + 4);
+    acc0 = vpadalq_s32(acc0, vabsq_s32(c0));
+    acc1 = vpadalq_s32(acc1, vabsq_s32(c1));
+  }
+
+  return (int)horizontal_add_int64x2(vaddq_s64(acc0, acc1));
+}
+
+// Computes the minimum and maximum of the absolute pixel differences
+// between two 8x8 high-bitdepth blocks.
+// s8/d8: CONVERT_TO_SHORTPTR-encoded pointers to the two blocks.
+// p/dp:  the blocks' respective strides, in elements.
+// min/max: outputs; written as full ints with the high bits cleared.
+void vpx_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8,
+                                int dp, int *min, int *max) {
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8);
+
+  const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p);
+  const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p);
+  const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p);
+  const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p);
+  const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p);
+  const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p);
+  const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p);
+  const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p);
+
+  const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp);
+  const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp);
+  const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp);
+  const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp);
+  const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp);
+  const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp);
+  const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp);
+  const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp);
+
+  // Per-row absolute differences.
+  const uint16x8_t abs_diff0 = vabdq_u16(a0, b0);
+  const uint16x8_t abs_diff1 = vabdq_u16(a1, b1);
+  const uint16x8_t abs_diff2 = vabdq_u16(a2, b2);
+  const uint16x8_t abs_diff3 = vabdq_u16(a3, b3);
+  const uint16x8_t abs_diff4 = vabdq_u16(a4, b4);
+  const uint16x8_t abs_diff5 = vabdq_u16(a5, b5);
+  const uint16x8_t abs_diff6 = vabdq_u16(a6, b6);
+  const uint16x8_t abs_diff7 = vabdq_u16(a7, b7);
+
+  // Tree reduction of the eight rows into one max vector and one min vector.
+  const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1);
+  const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3);
+  const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5);
+  const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7);
+
+  const uint16x8_t max0123 = vmaxq_u16(max01, max23);
+  const uint16x8_t max4567 = vmaxq_u16(max45, max67);
+  const uint16x8_t max07 = vmaxq_u16(max0123, max4567);
+
+  const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1);
+  const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3);
+  const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5);
+  const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7);
+
+  const uint16x8_t min0123 = vminq_u16(min01, min23);
+  const uint16x8_t min4567 = vminq_u16(min45, min67);
+  const uint16x8_t min07 = vminq_u16(min0123, min4567);
+
+#if VPX_ARCH_AARCH64
+  // AArch64 has across-vector reductions.
+  *min = *max = 0;  // Clear high bits
+  *((uint16_t *)max) = vmaxvq_u16(max07);
+  *((uint16_t *)min) = vminvq_u16(min07);
+#else
+  // Split into 64-bit vectors and execute pairwise min/max.
+  uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07));
+  uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07));
+
+  // Enough runs of vpmax/min propagate the max/min values to every position.
+  ab_max = vpmax_u16(ab_max, ab_max);
+  ab_min = vpmin_u16(ab_min, ab_min);
+
+  ab_max = vpmax_u16(ab_max, ab_max);
+  ab_min = vpmin_u16(ab_min, ab_min);
+
+  ab_max = vpmax_u16(ab_max, ab_max);
+  ab_min = vpmin_u16(ab_min, ab_min);
+
+  *min = *max = 0;  // Clear high bits
+  // Store directly to avoid costly neon->gpr transfer.
+  vst1_lane_u16((uint16_t *)max, ab_max, 0);
+  vst1_lane_u16((uint16_t *)min, ab_min, 0);
+#endif
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c
new file mode 100644
index 0000000000..3063acbb3e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+// Writes the rounded average of `pred` and `ref` into `comp_pred`.
+// `pred` and `comp_pred` are contiguous (stride == width); `ref` uses
+// `ref_stride`. width is expected to be 4, 8 or a multiple of 8.
+void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred,
+                                   int width, int height, const uint16_t *ref,
+                                   int ref_stride) {
+  int y = height;
+  if (width > 8) {
+    do {
+      int x;
+      for (x = 0; x < width; x += 8) {
+        const uint16x8_t p = vld1q_u16(pred + x);
+        const uint16x8_t r = vld1q_u16(ref + x);
+        vst1q_u16(comp_pred + x, vrhaddq_u16(p, r));
+      }
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--y != 0);
+  } else if (width == 8) {
+    do {
+      const uint16x8_t p = vld1q_u16(pred);
+      const uint16x8_t r = vld1q_u16(ref);
+      vst1q_u16(comp_pred, vrhaddq_u16(p, r));
+
+      comp_pred += 8;
+      pred += 8;
+      ref += ref_stride;
+    } while (--y != 0);
+  } else {
+    assert(width == 4);
+    do {
+      const uint16x4_t p = vld1_u16(pred);
+      const uint16x4_t r = vld1_u16(ref);
+      vst1_u16(comp_pred, vrhadd_u16(p, r));
+
+      comp_pred += 4;
+      pred += 4;
+      ref += ref_stride;
+    } while (--y != 0);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c
new file mode 100644
index 0000000000..499eb65462
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+// First (16-bit) pass of the high-bitdepth 8x8 Hadamard transform: three
+// butterfly stages over the eight input vectors, in place.
+// Note the final stage writes the a* outputs in a permuted order; callers
+// rely on that exact ordering.
+static INLINE void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1,
+                                                   int16x8_t *a2, int16x8_t *a3,
+                                                   int16x8_t *a4, int16x8_t *a5,
+                                                   int16x8_t *a6,
+                                                   int16x8_t *a7) {
+  // Stage 1: butterflies between adjacent input pairs.
+  int16x8_t b0 = vaddq_s16(*a0, *a1);
+  int16x8_t b1 = vsubq_s16(*a0, *a1);
+  int16x8_t b2 = vaddq_s16(*a2, *a3);
+  int16x8_t b3 = vsubq_s16(*a2, *a3);
+  int16x8_t b4 = vaddq_s16(*a4, *a5);
+  int16x8_t b5 = vsubq_s16(*a4, *a5);
+  int16x8_t b6 = vaddq_s16(*a6, *a7);
+  int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+  // Stage 2: butterflies between pairs of pairs.
+  int16x8_t c0 = vaddq_s16(b0, b2);
+  int16x8_t c2 = vsubq_s16(b0, b2);
+  int16x8_t c1 = vaddq_s16(b1, b3);
+  int16x8_t c3 = vsubq_s16(b1, b3);
+  int16x8_t c4 = vaddq_s16(b4, b6);
+  int16x8_t c6 = vsubq_s16(b4, b6);
+  int16x8_t c5 = vaddq_s16(b5, b7);
+  int16x8_t c7 = vsubq_s16(b5, b7);
+
+  // Stage 3: combine halves; destinations are deliberately permuted.
+  *a0 = vaddq_s16(c0, c4);
+  *a2 = vsubq_s16(c0, c4);
+  *a7 = vaddq_s16(c1, c5);
+  *a6 = vsubq_s16(c1, c5);
+  *a3 = vaddq_s16(c2, c6);
+  *a1 = vsubq_s16(c2, c6);
+  *a4 = vaddq_s16(c3, c7);
+  *a5 = vsubq_s16(c3, c7);
+}
+
+// Second pass of the high-bitdepth 8x8 Hadamard transform over four columns:
+// widens to 32 bits on the first butterfly stage, applies two more stages,
+// and stores the eight 4-lane results to `coeff`.
+// As in the first pass, the last stage produces its outputs (d*) in a
+// permuted order, matching the store sequence below.
+static INLINE void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1,
+                                                    int16x4_t a2, int16x4_t a3,
+                                                    int16x4_t a4, int16x4_t a5,
+                                                    int16x4_t a6, int16x4_t a7,
+                                                    tran_low_t *coeff) {
+  // Stage 1: widening butterflies (16-bit in, 32-bit out).
+  int32x4_t b0 = vaddl_s16(a0, a1);
+  int32x4_t b1 = vsubl_s16(a0, a1);
+  int32x4_t b2 = vaddl_s16(a2, a3);
+  int32x4_t b3 = vsubl_s16(a2, a3);
+  int32x4_t b4 = vaddl_s16(a4, a5);
+  int32x4_t b5 = vsubl_s16(a4, a5);
+  int32x4_t b6 = vaddl_s16(a6, a7);
+  int32x4_t b7 = vsubl_s16(a6, a7);
+
+  // Stage 2: butterflies between pairs of pairs.
+  int32x4_t c0 = vaddq_s32(b0, b2);
+  int32x4_t c2 = vsubq_s32(b0, b2);
+  int32x4_t c1 = vaddq_s32(b1, b3);
+  int32x4_t c3 = vsubq_s32(b1, b3);
+  int32x4_t c4 = vaddq_s32(b4, b6);
+  int32x4_t c6 = vsubq_s32(b4, b6);
+  int32x4_t c5 = vaddq_s32(b5, b7);
+  int32x4_t c7 = vsubq_s32(b5, b7);
+
+  // Stage 3: combine halves; destinations are deliberately permuted.
+  int32x4_t d0 = vaddq_s32(c0, c4);
+  int32x4_t d2 = vsubq_s32(c0, c4);
+  int32x4_t d7 = vaddq_s32(c1, c5);
+  int32x4_t d6 = vsubq_s32(c1, c5);
+  int32x4_t d3 = vaddq_s32(c2, c6);
+  int32x4_t d1 = vsubq_s32(c2, c6);
+  int32x4_t d4 = vaddq_s32(c3, c7);
+  int32x4_t d5 = vsubq_s32(c3, c7);
+
+  store_s32q_to_tran_low(coeff + 0, d0);
+  store_s32q_to_tran_low(coeff + 4, d1);
+  store_s32q_to_tran_low(coeff + 8, d2);
+  store_s32q_to_tran_low(coeff + 12, d3);
+  store_s32q_to_tran_low(coeff + 16, d4);
+  store_s32q_to_tran_low(coeff + 20, d5);
+  store_s32q_to_tran_low(coeff + 24, d6);
+  store_s32q_to_tran_low(coeff + 28, d7);
+}
+
+// 8x8 Hadamard transform for high-bitdepth residuals.
+void vpx_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                                  tran_low_t *coeff) {
+  int16x8_t r0 = vld1q_s16(src_diff + 0 * src_stride);
+  int16x8_t r1 = vld1q_s16(src_diff + 1 * src_stride);
+  int16x8_t r2 = vld1q_s16(src_diff + 2 * src_stride);
+  int16x8_t r3 = vld1q_s16(src_diff + 3 * src_stride);
+  int16x8_t r4 = vld1q_s16(src_diff + 4 * src_stride);
+  int16x8_t r5 = vld1q_s16(src_diff + 5 * src_stride);
+  int16x8_t r6 = vld1q_s16(src_diff + 6 * src_stride);
+  int16x8_t r7 = vld1q_s16(src_diff + 7 * src_stride);
+
+  // The first pass fits in 16-bit elements (4095 * 8 = 32760).
+  hadamard_highbd_col8_first_pass(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+
+  transpose_s16_8x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+
+  // The second pass widens to 32-bit elements and therefore processes four
+  // columns at a time. The final transpose is skipped; it is not required.
+  hadamard_highbd_col4_second_pass(vget_low_s16(r0), vget_low_s16(r1),
+                                   vget_low_s16(r2), vget_low_s16(r3),
+                                   vget_low_s16(r4), vget_low_s16(r5),
+                                   vget_low_s16(r6), vget_low_s16(r7), coeff);
+
+  hadamard_highbd_col4_second_pass(vget_high_s16(r0), vget_high_s16(r1),
+                                   vget_high_s16(r2), vget_high_s16(r3),
+                                   vget_high_s16(r4), vget_high_s16(r5),
+                                   vget_high_s16(r6), vget_high_s16(r7),
+                                   coeff + 32);
+}
+
+// 16x16 high-bitdepth Hadamard: four 8x8 transforms plus a combining pass.
+// The first combining stage uses halving operations to bound the 32-bit
+// dynamic range.
+void vpx_highbd_hadamard_16x16_neon(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  int i;
+
+  // Rearrange 16x16 into four contiguous 64-coefficient buckets
+  // (top-left, top-right, bottom-left, bottom-right).
+  vpx_highbd_hadamard_8x8_neon(src_diff, src_stride, coeff);
+  vpx_highbd_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64);
+  vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride,
+                               coeff + 128);
+  vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride + 8, src_stride,
+                               coeff + 192);
+
+  // Combine the four buckets, four coefficients at a time.
+  for (i = 0; i < 64; i += 4) {
+    const int32x4_t a0 = load_tran_low_to_s32q(coeff + i);
+    const int32x4_t a1 = load_tran_low_to_s32q(coeff + i + 64);
+    const int32x4_t a2 = load_tran_low_to_s32q(coeff + i + 128);
+    const int32x4_t a3 = load_tran_low_to_s32q(coeff + i + 192);
+
+    const int32x4_t b0 = vhaddq_s32(a0, a1);
+    const int32x4_t b1 = vhsubq_s32(a0, a1);
+    const int32x4_t b2 = vhaddq_s32(a2, a3);
+    const int32x4_t b3 = vhsubq_s32(a2, a3);
+
+    store_s32q_to_tran_low(coeff + i, vaddq_s32(b0, b2));
+    store_s32q_to_tran_low(coeff + i + 64, vaddq_s32(b1, b3));
+    store_s32q_to_tran_low(coeff + i + 128, vsubq_s32(b0, b2));
+    store_s32q_to_tran_low(coeff + i + 192, vsubq_s32(b1, b3));
+  }
+}
+
+// 32x32 high-bitdepth Hadamard: four 16x16 transforms plus a combining
+// pass. Both combining stages use halving operations.
+void vpx_highbd_hadamard_32x32_neon(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  int i;
+
+  // Rearrange 32x32 into four contiguous 256-coefficient buckets
+  // (top-left, top-right, bottom-left, bottom-right).
+  vpx_highbd_hadamard_16x16_neon(src_diff, src_stride, coeff);
+  vpx_highbd_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256);
+  vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride,
+                                 coeff + 512);
+  vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride + 16, src_stride,
+                                 coeff + 768);
+
+  // Combine the four buckets, four coefficients at a time.
+  for (i = 0; i < 256; i += 4) {
+    const int32x4_t a0 = load_tran_low_to_s32q(coeff + i);
+    const int32x4_t a1 = load_tran_low_to_s32q(coeff + i + 256);
+    const int32x4_t a2 = load_tran_low_to_s32q(coeff + i + 512);
+    const int32x4_t a3 = load_tran_low_to_s32q(coeff + i + 768);
+
+    const int32x4_t b0 = vhaddq_s32(a0, a1);
+    const int32x4_t b1 = vhsubq_s32(a0, a1);
+    const int32x4_t b2 = vhaddq_s32(a2, a3);
+    const int32x4_t b3 = vhsubq_s32(a2, a3);
+
+    store_s32q_to_tran_low(coeff + i, vhaddq_s32(b0, b2));
+    store_s32q_to_tran_low(coeff + i + 256, vhaddq_s32(b1, b3));
+    store_s32q_to_tran_low(coeff + i + 512, vhsubq_s32(b0, b2));
+    store_s32q_to_tran_low(coeff + i + 768, vhsubq_s32(b1, b3));
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c
new file mode 100644
index 0000000000..654ab42ca4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -0,0 +1,1361 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE int32x4_t dct_const_round_shift_high_4(const int64x2x2_t in) {  // round-shift four 64-bit lanes back down to one int32x4
+ int32x2x2_t t32;
+
+ t32.val[0] = vrshrn_n_s64(in.val[0], DCT_CONST_BITS);  // rounding narrow: (x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS
+ t32.val[1] = vrshrn_n_s64(in.val[1], DCT_CONST_BITS);
+ return vcombine_s32(t32.val[0], t32.val[1]);
+}
+
+static INLINE void dct_const_round_shift_high_4_dual(
+ const int64x2x2_t *const in, int32x4_t *const d0, int32x4_t *const d1) {  // round-shift a pair of 64-bit vectors into two int32x4 results
+ *d0 = dct_const_round_shift_high_4(in[0]);
+ *d1 = dct_const_round_shift_high_4(in[1]);
+}
+
+static INLINE int32x4x2_t
+dct_const_round_shift_high_4x2_int64x2x2(const int64x2x2_t *const in) {  // round-shift 8 lanes (two 64-bit vectors) into one int32x4x2_t
+ int32x4x2_t out;
+ out.val[0] = dct_const_round_shift_high_4(in[0]);
+ out.val[1] = dct_const_round_shift_high_4(in[1]);
+ return out;
+}
+
+static INLINE void dct_const_round_shift_high_4x2x2(const int64x2x2_t *const in,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {  // round-shift 16 lanes (in[0..3]) into two int32x4x2_t outputs
+ *d0 = dct_const_round_shift_high_4x2_int64x2x2(in + 0);
+ *d1 = dct_const_round_shift_high_4x2_int64x2x2(in + 2);
+}
+
+static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_2_30_10_22,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {  // butterfly: d0 = round(s0*c30 - s1*c2), d1 = round(s1*c30 + s0*c2)
+ int64x2x2_t t[4];  // 64-bit accumulators; c30 = low-half lane 1, c2 = low-half lane 0 (per kCospi32 layout -- confirm)
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);  // round-shift back to 32 bits
+}
+
+static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_4_12_20N_28,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {  // butterfly: d0 = round(s0*c28 - s1*c4), d1 = round(s1*c28 + s0*c4)
+ int64x2x2_t t[4];  // c28 = high-half lane 1, c4 = low-half lane 0 (per kCospi32 layout -- confirm)
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);  // round-shift back to 32 bits
+}
+
+static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_6_26N_14_18N,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {  // butterfly: d0 = round(s0*c6 - s1*c26), d1 = round(s1*c6 + s0*c26)
+ int64x2x2_t t[4];  // c6 = low-half lane 0; low-half lane 1 stores -cospi_26 ("26N"), so vmlal/vmlsl roles are swapped
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_6_26N_14_18N), 0);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_6_26N_14_18N), 0);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_6_26N_14_18N), 0);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_6_26N_14_18N), 0);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_6_26N_14_18N), 0);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_6_26N_14_18N), 0);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_6_26N_14_18N), 0);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_6_26N_14_18N), 0);
+ t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_6_26N_14_18N), 1);
+ t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_6_26N_14_18N), 1);
+ t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_6_26N_14_18N), 1);
+ t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_6_26N_14_18N), 1);
+ t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_6_26N_14_18N), 1);
+ t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_6_26N_14_18N), 1);
+ t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_6_26N_14_18N), 1);
+ t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_6_26N_14_18N), 1);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);  // round-shift back to 32 bits
+}
+
+static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_2_30_10_22,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {  // butterfly: d0 = round(s0*c22 - s1*c10), d1 = round(s1*c22 + s0*c10)
+ int64x2x2_t t[4];  // c22 = high-half lane 1, c10 = high-half lane 0 (per kCospi32 layout -- confirm)
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);  // round-shift back to 32 bits
+}
+
+static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_4_12_20N_28,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {  // butterfly: d0 = round(s0*c12 - s1*c20), d1 = round(s1*c12 + s0*c20)
+ int64x2x2_t t[4];  // c12 = low-half lane 1; high-half lane 0 stores -cospi_20 ("20N"), so vmlal/vmlsl roles are swapped
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);  // round-shift back to 32 bits
+}
+
+static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_6_26N_14_18N,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {  // butterfly: d0 = round(s0*c14 - s1*c18), d1 = round(s1*c14 + s0*c18)
+ int64x2x2_t t[4];  // c14 = high-half lane 0; high-half lane 1 stores -cospi_18 ("18N"), so vmlal/vmlsl roles are swapped
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_6_26N_14_18N), 0);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_6_26N_14_18N), 0);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_6_26N_14_18N), 0);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_6_26N_14_18N), 0);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_6_26N_14_18N), 0);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_6_26N_14_18N), 0);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_6_26N_14_18N), 0);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_6_26N_14_18N), 0);
+ t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_6_26N_14_18N), 1);
+ t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_6_26N_14_18N), 1);
+ t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_6_26N_14_18N), 1);
+ t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_6_26N_14_18N), 1);
+ t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_6_26N_14_18N), 1);
+ t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_6_26N_14_18N), 1);
+ t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_6_26N_14_18N), 1);
+ t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_6_26N_14_18N), 1);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);  // round-shift back to 32 bits
+}
+
+static INLINE void highbd_idct_cospi_8_24_q_kernel(
+ const int32x4x2_t s0, const int32x4x2_t s1, const int32x4_t cospi_0_8_16_24,
+ int64x2x2_t *const t) {  // 64-bit products only, no rounding: t[0..1] = s0*c24 - s1*c8, t[2..3] = s1*c24 + s0*c8 (c24 = high lane 1, c8 = low lane 1)
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_d_kernel(
+ const int32x4_t s0, const int32x4_t s1, const int32x4_t cospi_0_8_16_24,
+ int64x2x2_t *const t) {  // single-vector variant of the q kernel: t[0] = s0*c24 - s1*c8, t[1] = s1*c24 + s0*c8 (no rounding)
+ t[0].val[0] =
+ vmull_lane_s32(vget_low_s32(s0), vget_high_s32(cospi_0_8_16_24), 1);
+ t[0].val[1] =
+ vmull_lane_s32(vget_high_s32(s0), vget_high_s32(cospi_0_8_16_24), 1);
+ t[1].val[0] =
+ vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 1);
+ t[1].val[1] =
+ vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 1);
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s0),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s0),
+ vget_low_s32(cospi_0_8_16_24), 1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_q(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {  // d0 = round(s0*c24 - s1*c8), d1 = round(s1*c24 + s0*c8)
+ int64x2x2_t t[4];
+
+ highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0,
+ const int32x4_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4_t *const d0,
+ int32x4_t *const d1) {  // single-vector: d0 = round(s0*c24 - s1*c8), d1 = round(s1*c24 + s0*c8)
+ int64x2x2_t t[2];
+
+ highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t);
+ dct_const_round_shift_high_4_dual(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {  // like highbd_idct_cospi_8_24_q but negates the second result: d1 = round(-(s1*c24 + s0*c8))
+ int64x2x2_t t[4];
+
+ highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t);
+ t[2].val[0] = vsubq_s64(vdupq_n_s64(0), t[2].val[0]);  // negate before rounding so the round bias matches the non-neg path
+ t[2].val[1] = vsubq_s64(vdupq_n_s64(0), t[2].val[1]);
+ t[3].val[0] = vsubq_s64(vdupq_n_s64(0), t[3].val[0]);
+ t[3].val[1] = vsubq_s64(vdupq_n_s64(0), t[3].val[1]);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0,
+ const int32x4_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4_t *const d0,
+ int32x4_t *const d1) {  // single-vector variant of the neg_q helper: d1 = round(-(s1*c24 + s0*c8))
+ int64x2x2_t t[2];
+
+ highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t);
+ t[1].val[0] = vsubq_s64(vdupq_n_s64(0), t[1].val[0]);  // negate the 64-bit sums before rounding
+ t[1].val[1] = vsubq_s64(vdupq_n_s64(0), t[1].val[1]);
+ dct_const_round_shift_high_4_dual(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {  // d0 = round((s1 - s0) * c16), d1 = round((s1 + s0) * c16); c16 = high-half lane 0
+ int64x2x2_t t[6];  // t[4], t[5] hold the shared s1*c16 products, reused for both outputs
+
+ t[4].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[4].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[5].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[5].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[0].val[0] = vmlsl_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[1].val[0] = vmlsl_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[1].val[1] = vmlsl_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[2].val[0] = vmlal_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[2].val[1] = vmlal_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[3].val[0] = vmlal_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[3].val[1] = vmlal_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0,
+ const int32x4_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4_t *const d0,
+ int32x4_t *const d1) {  // single-vector: d0 = round((s1 - s0) * c16), d1 = round((s1 + s0) * c16)
+ int64x2x2_t t[3];  // t[2] holds the shared s1*c16 product
+
+ t[2].val[0] =
+ vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 0);
+ t[2].val[1] =
+ vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 0);
+ t[0].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[1].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[1].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ dct_const_round_shift_high_4_dual(t, d0, d1);
+}
+
+static INLINE void highbd_idct16x16_add_stage7_dual(
+ const int32x4x2_t *const step2, int32x4x2_t *const out) {  // final butterfly: out[i] = step2[i] + step2[15-i] for i<8, step2[15-i] - step2[i] for i>=8
+ out[0].val[0] = vaddq_s32(step2[0].val[0], step2[15].val[0]);
+ out[0].val[1] = vaddq_s32(step2[0].val[1], step2[15].val[1]);
+ out[1].val[0] = vaddq_s32(step2[1].val[0], step2[14].val[0]);
+ out[1].val[1] = vaddq_s32(step2[1].val[1], step2[14].val[1]);
+ out[2].val[0] = vaddq_s32(step2[2].val[0], step2[13].val[0]);
+ out[2].val[1] = vaddq_s32(step2[2].val[1], step2[13].val[1]);
+ out[3].val[0] = vaddq_s32(step2[3].val[0], step2[12].val[0]);
+ out[3].val[1] = vaddq_s32(step2[3].val[1], step2[12].val[1]);
+ out[4].val[0] = vaddq_s32(step2[4].val[0], step2[11].val[0]);
+ out[4].val[1] = vaddq_s32(step2[4].val[1], step2[11].val[1]);
+ out[5].val[0] = vaddq_s32(step2[5].val[0], step2[10].val[0]);
+ out[5].val[1] = vaddq_s32(step2[5].val[1], step2[10].val[1]);
+ out[6].val[0] = vaddq_s32(step2[6].val[0], step2[9].val[0]);
+ out[6].val[1] = vaddq_s32(step2[6].val[1], step2[9].val[1]);
+ out[7].val[0] = vaddq_s32(step2[7].val[0], step2[8].val[0]);
+ out[7].val[1] = vaddq_s32(step2[7].val[1], step2[8].val[1]);
+ out[8].val[0] = vsubq_s32(step2[7].val[0], step2[8].val[0]);
+ out[8].val[1] = vsubq_s32(step2[7].val[1], step2[8].val[1]);
+ out[9].val[0] = vsubq_s32(step2[6].val[0], step2[9].val[0]);
+ out[9].val[1] = vsubq_s32(step2[6].val[1], step2[9].val[1]);
+ out[10].val[0] = vsubq_s32(step2[5].val[0], step2[10].val[0]);
+ out[10].val[1] = vsubq_s32(step2[5].val[1], step2[10].val[1]);
+ out[11].val[0] = vsubq_s32(step2[4].val[0], step2[11].val[0]);
+ out[11].val[1] = vsubq_s32(step2[4].val[1], step2[11].val[1]);
+ out[12].val[0] = vsubq_s32(step2[3].val[0], step2[12].val[0]);
+ out[12].val[1] = vsubq_s32(step2[3].val[1], step2[12].val[1]);
+ out[13].val[0] = vsubq_s32(step2[2].val[0], step2[13].val[0]);
+ out[13].val[1] = vsubq_s32(step2[2].val[1], step2[13].val[1]);
+ out[14].val[0] = vsubq_s32(step2[1].val[0], step2[14].val[0]);
+ out[14].val[1] = vsubq_s32(step2[1].val[1], step2[14].val[1]);
+ out[15].val[0] = vsubq_s32(step2[0].val[0], step2[15].val[0]);
+ out[15].val[1] = vsubq_s32(step2[0].val[1], step2[15].val[1]);
+}
+
+static INLINE void highbd_idct16x16_add_stage7(const int32x4_t *const step2,
+ int32x4_t *const out) {  // single-vector variant: out[i] = step2[i] + step2[15-i] (i<8), step2[15-i] - step2[i] (i>=8)
+ out[0] = vaddq_s32(step2[0], step2[15]);
+ out[1] = vaddq_s32(step2[1], step2[14]);
+ out[2] = vaddq_s32(step2[2], step2[13]);
+ out[3] = vaddq_s32(step2[3], step2[12]);
+ out[4] = vaddq_s32(step2[4], step2[11]);
+ out[5] = vaddq_s32(step2[5], step2[10]);
+ out[6] = vaddq_s32(step2[6], step2[9]);
+ out[7] = vaddq_s32(step2[7], step2[8]);
+ out[8] = vsubq_s32(step2[7], step2[8]);
+ out[9] = vsubq_s32(step2[6], step2[9]);
+ out[10] = vsubq_s32(step2[5], step2[10]);
+ out[11] = vsubq_s32(step2[4], step2[11]);
+ out[12] = vsubq_s32(step2[3], step2[12]);
+ out[13] = vsubq_s32(step2[2], step2[13]);
+ out[14] = vsubq_s32(step2[1], step2[14]);
+ out[15] = vsubq_s32(step2[0], step2[15]);
+}
+
+void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output,
+ uint16_t *dest, const int stride,
+ const int bd) {  // one 16x8 half of the 16x16 idct; pass 1 (output != NULL) stores intermediates, pass 2 adds into dest
+ const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
+ const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
+ const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
+ const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
+ int32x4x2_t in[16], step1[16], step2[16], out[16];
+
+ // Load input (16x8). Even rows go to in[0..7], odd rows to in[8..15].
+ in[0].val[0] = vld1q_s32(input);
+ in[0].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[8].val[0] = vld1q_s32(input);
+ in[8].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[1].val[0] = vld1q_s32(input);
+ in[1].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[9].val[0] = vld1q_s32(input);
+ in[9].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[2].val[0] = vld1q_s32(input);
+ in[2].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[10].val[0] = vld1q_s32(input);
+ in[10].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[3].val[0] = vld1q_s32(input);
+ in[3].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[11].val[0] = vld1q_s32(input);
+ in[11].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[4].val[0] = vld1q_s32(input);
+ in[4].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[12].val[0] = vld1q_s32(input);
+ in[12].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[5].val[0] = vld1q_s32(input);
+ in[5].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[13].val[0] = vld1q_s32(input);
+ in[13].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[6].val[0] = vld1q_s32(input);
+ in[6].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[14].val[0] = vld1q_s32(input);
+ in[14].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[7].val[0] = vld1q_s32(input);
+ in[7].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[15].val[0] = vld1q_s32(input);
+ in[15].val[1] = vld1q_s32(input + 4);
+
+ // Transpose
+ transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+ transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
+ &in[15]);
+
+ // stage 1: reorder inputs into butterfly order (row indices halved because of the interleaved load above)
+ step1[0] = in[0 / 2];
+ step1[1] = in[16 / 2];
+ step1[2] = in[8 / 2];
+ step1[3] = in[24 / 2];
+ step1[4] = in[4 / 2];
+ step1[5] = in[20 / 2];
+ step1[6] = in[12 / 2];
+ step1[7] = in[28 / 2];
+ step1[8] = in[2 / 2];
+ step1[9] = in[18 / 2];
+ step1[10] = in[10 / 2];
+ step1[11] = in[26 / 2];
+ step1[12] = in[6 / 2];
+ step1[13] = in[22 / 2];
+ step1[14] = in[14 / 2];
+ step1[15] = in[30 / 2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+ highbd_idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8],
+ &step2[15]);
+ highbd_idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9],
+ &step2[14]);
+ highbd_idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
+ &step2[13]);
+ highbd_idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11],
+ &step2[12]);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+ highbd_idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4],
+ &step1[7]);
+ highbd_idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5],
+ &step1[6]);
+ step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[9].val[0]);
+ step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[9].val[1]);
+ step1[9].val[0] = vsubq_s32(step2[8].val[0], step2[9].val[0]);
+ step1[9].val[1] = vsubq_s32(step2[8].val[1], step2[9].val[1]);
+ step1[10].val[0] = vsubq_s32(step2[11].val[0], step2[10].val[0]);
+ step1[10].val[1] = vsubq_s32(step2[11].val[1], step2[10].val[1]);
+ step1[11].val[0] = vaddq_s32(step2[11].val[0], step2[10].val[0]);
+ step1[11].val[1] = vaddq_s32(step2[11].val[1], step2[10].val[1]);
+ step1[12].val[0] = vaddq_s32(step2[12].val[0], step2[13].val[0]);
+ step1[12].val[1] = vaddq_s32(step2[12].val[1], step2[13].val[1]);
+ step1[13].val[0] = vsubq_s32(step2[12].val[0], step2[13].val[0]);
+ step1[13].val[1] = vsubq_s32(step2[12].val[1], step2[13].val[1]);
+ step1[14].val[0] = vsubq_s32(step2[15].val[0], step2[14].val[0]);
+ step1[14].val[1] = vsubq_s32(step2[15].val[1], step2[14].val[1]);
+ step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[14].val[0]);
+ step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[14].val[1]);
+
+ // stage 4
+ highbd_idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1],
+ &step2[0]);
+ highbd_idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2],
+ &step2[3]);
+ step2[4].val[0] = vaddq_s32(step1[4].val[0], step1[5].val[0]);
+ step2[4].val[1] = vaddq_s32(step1[4].val[1], step1[5].val[1]);
+ step2[5].val[0] = vsubq_s32(step1[4].val[0], step1[5].val[0]);
+ step2[5].val[1] = vsubq_s32(step1[4].val[1], step1[5].val[1]);
+ step2[6].val[0] = vsubq_s32(step1[7].val[0], step1[6].val[0]);
+ step2[6].val[1] = vsubq_s32(step1[7].val[1], step1[6].val[1]);
+ step2[7].val[0] = vaddq_s32(step1[7].val[0], step1[6].val[0]);
+ step2[7].val[1] = vaddq_s32(step1[7].val[1], step1[6].val[1]);
+ step2[8] = step1[8];
+ highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
+ &step2[13], &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0].val[0] = vaddq_s32(step2[0].val[0], step2[3].val[0]);
+ step1[0].val[1] = vaddq_s32(step2[0].val[1], step2[3].val[1]);
+ step1[1].val[0] = vaddq_s32(step2[1].val[0], step2[2].val[0]);
+ step1[1].val[1] = vaddq_s32(step2[1].val[1], step2[2].val[1]);
+ step1[2].val[0] = vsubq_s32(step2[1].val[0], step2[2].val[0]);
+ step1[2].val[1] = vsubq_s32(step2[1].val[1], step2[2].val[1]);
+ step1[3].val[0] = vsubq_s32(step2[0].val[0], step2[3].val[0]);
+ step1[3].val[1] = vsubq_s32(step2[0].val[1], step2[3].val[1]);
+ step1[4] = step2[4];
+ highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+ step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[11].val[0]);
+ step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[11].val[1]);
+ step1[9].val[0] = vaddq_s32(step2[9].val[0], step2[10].val[0]);
+ step1[9].val[1] = vaddq_s32(step2[9].val[1], step2[10].val[1]);
+ step1[10].val[0] = vsubq_s32(step2[9].val[0], step2[10].val[0]);
+ step1[10].val[1] = vsubq_s32(step2[9].val[1], step2[10].val[1]);
+ step1[11].val[0] = vsubq_s32(step2[8].val[0], step2[11].val[0]);
+ step1[11].val[1] = vsubq_s32(step2[8].val[1], step2[11].val[1]);
+ step1[12].val[0] = vsubq_s32(step2[15].val[0], step2[12].val[0]);
+ step1[12].val[1] = vsubq_s32(step2[15].val[1], step2[12].val[1]);
+ step1[13].val[0] = vsubq_s32(step2[14].val[0], step2[13].val[0]);
+ step1[13].val[1] = vsubq_s32(step2[14].val[1], step2[13].val[1]);
+ step1[14].val[0] = vaddq_s32(step2[14].val[0], step2[13].val[0]);
+ step1[14].val[1] = vaddq_s32(step2[14].val[1], step2[13].val[1]);
+ step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[12].val[0]);
+ step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[12].val[1]);
+
+ // stage 6
+ step2[0].val[0] = vaddq_s32(step1[0].val[0], step1[7].val[0]);
+ step2[0].val[1] = vaddq_s32(step1[0].val[1], step1[7].val[1]);
+ step2[1].val[0] = vaddq_s32(step1[1].val[0], step1[6].val[0]);
+ step2[1].val[1] = vaddq_s32(step1[1].val[1], step1[6].val[1]);
+ step2[2].val[0] = vaddq_s32(step1[2].val[0], step1[5].val[0]);
+ step2[2].val[1] = vaddq_s32(step1[2].val[1], step1[5].val[1]);
+ step2[3].val[0] = vaddq_s32(step1[3].val[0], step1[4].val[0]);
+ step2[3].val[1] = vaddq_s32(step1[3].val[1], step1[4].val[1]);
+ step2[4].val[0] = vsubq_s32(step1[3].val[0], step1[4].val[0]);
+ step2[4].val[1] = vsubq_s32(step1[3].val[1], step1[4].val[1]);
+ step2[5].val[0] = vsubq_s32(step1[2].val[0], step1[5].val[0]);
+ step2[5].val[1] = vsubq_s32(step1[2].val[1], step1[5].val[1]);
+ step2[6].val[0] = vsubq_s32(step1[1].val[0], step1[6].val[0]);
+ step2[6].val[1] = vsubq_s32(step1[1].val[1], step1[6].val[1]);
+ step2[7].val[0] = vsubq_s32(step1[0].val[0], step1[7].val[0]);
+ step2[7].val[1] = vsubq_s32(step1[0].val[1], step1[7].val[1]);
+ highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ highbd_idct16x16_add_stage7_dual(step2, out);
+
+ if (output) {  // pass 1: write 32-bit intermediates for the second pass
+ highbd_idct16x16_store_pass1(out, output);
+ } else {  // pass 2: accumulate into dest (presumably clamped to bd inside the helper -- confirm)
+ highbd_idct16x16_add_store(out, dest, stride, bd);
+ }
+}
+
+static INLINE int32x4x2_t highbd_idct_cospi_lane0_dual(const int32x4x2_t s,
+ const int32x2_t coef) {  // returns round(s * coef[0]) for all 8 lanes (widen to 64-bit, then round-shift)
+ int64x2x2_t t[2];
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 0);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 0);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 0);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 0);
+ return dct_const_round_shift_high_4x2_int64x2x2(t);
+}
+
+static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s,
+ const int32x2_t coef) {  // returns round(s * coef[0]) for 4 lanes
+ int64x2x2_t t;
+
+ t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 0);
+ t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 0);
+ return dct_const_round_shift_high_4(t);
+}
+
+static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s,
+ const int32x2_t coef) {  // returns round(s * coef[1]) for all 8 lanes
+ int64x2x2_t t[2];
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 1);
+ return dct_const_round_shift_high_4x2_int64x2x2(t);
+}
+
+// Same as highbd_idct_cospi_lane0() but multiplies by lane 1 of |coef|.
+static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s,
+                                                const int32x2_t coef) {
+  int64x2x2_t prod;
+
+  prod.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 1);
+  prod.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 1);
+  return dct_const_round_shift_high_4(prod);
+}
+
+// One half (an 8-column strip) of the 16x16 inverse DCT for the case named
+// "38" (per the function name, presumably 38 non-zero coefficients confined
+// to the upper-left 8x8 block — confirm against the caller).
+// When |output| is non-NULL this is pass 1 and the 16x8 intermediate result
+// is stored to |output|; otherwise this is pass 2 and the reconstructed
+// residual is added to |dest| and clamped for bit depth |bd|.
+static void vpx_highbd_idct16x16_38_add_half1d(const int32_t *input,
+                                               int32_t *output, uint16_t *dest,
+                                               const int stride, const int bd) {
+  const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
+  const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
+  const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
+  const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
+  int32x4x2_t in[8], step1[16], step2[16], out[16];
+
+  // Load input (8x8): rows of the source are 16 int32s apart.
+  in[0].val[0] = vld1q_s32(input);
+  in[0].val[1] = vld1q_s32(input + 4);
+  input += 16;
+  in[1].val[0] = vld1q_s32(input);
+  in[1].val[1] = vld1q_s32(input + 4);
+  input += 16;
+  in[2].val[0] = vld1q_s32(input);
+  in[2].val[1] = vld1q_s32(input + 4);
+  input += 16;
+  in[3].val[0] = vld1q_s32(input);
+  in[3].val[1] = vld1q_s32(input + 4);
+  input += 16;
+  in[4].val[0] = vld1q_s32(input);
+  in[4].val[1] = vld1q_s32(input + 4);
+  input += 16;
+  in[5].val[0] = vld1q_s32(input);
+  in[5].val[1] = vld1q_s32(input + 4);
+  input += 16;
+  in[6].val[0] = vld1q_s32(input);
+  in[6].val[1] = vld1q_s32(input + 4);
+  input += 16;
+  in[7].val[0] = vld1q_s32(input);
+  in[7].val[1] = vld1q_s32(input + 4);
+
+  // Transpose
+  transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
+
+  // stage 1
+  // The /2 indexing maps the usual coefficient index to in[], which holds
+  // only the 8 loaded rows; odd-indexed rows are zero in this case and have
+  // no slot.
+  step1[0] = in[0 / 2];
+  step1[2] = in[8 / 2];
+  step1[4] = in[4 / 2];
+  step1[6] = in[12 / 2];
+  step1[8] = in[2 / 2];
+  step1[10] = in[10 / 2];
+  step1[12] = in[6 / 2];
+  step1[14] = in[14 / 2];  // 0 in pass 1
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[2] = step1[2];
+  step2[4] = step1[4];
+  step2[6] = step1[6];
+  step2[8] =
+      highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
+  step2[9] = highbd_idct_cospi_lane1_dual(step1[14],
+                                          vget_high_s32(cospi_6_26N_14_18N));
+  step2[10] =
+      highbd_idct_cospi_lane1_dual(step1[10], vget_high_s32(cospi_2_30_10_22));
+  step2[11] =
+      highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+  step2[12] =
+      highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+  step2[13] =
+      highbd_idct_cospi_lane0_dual(step1[10], vget_high_s32(cospi_2_30_10_22));
+  step2[14] = highbd_idct_cospi_lane0_dual(step1[14],
+                                           vget_high_s32(cospi_6_26N_14_18N));
+  step2[15] =
+      highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[2] = step2[2];
+  step1[4] =
+      highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28));
+  step1[5] =
+      highbd_idct_cospi_lane0_dual(step2[6], vget_high_s32(cospi_4_12_20N_28));
+  step1[6] =
+      highbd_idct_cospi_lane1_dual(step2[6], vget_low_s32(cospi_4_12_20N_28));
+  step1[7] =
+      highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28));
+  step1[8] = highbd_idct_add_dual(step2[8], step2[9]);
+  step1[9] = highbd_idct_sub_dual(step2[8], step2[9]);
+  step1[10] = highbd_idct_sub_dual(step2[11], step2[10]);
+  step1[11] = highbd_idct_add_dual(step2[11], step2[10]);
+  step1[12] = highbd_idct_add_dual(step2[12], step2[13]);
+  step1[13] = highbd_idct_sub_dual(step2[12], step2[13]);
+  step1[14] = highbd_idct_sub_dual(step2[15], step2[14]);
+  step1[15] = highbd_idct_add_dual(step2[15], step2[14]);
+
+  // stage 4
+  step2[0] = step2[1] =
+      highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24));
+  step2[2] =
+      highbd_idct_cospi_lane1_dual(step1[2], vget_high_s32(cospi_0_8_16_24));
+  step2[3] =
+      highbd_idct_cospi_lane1_dual(step1[2], vget_low_s32(cospi_0_8_16_24));
+  step2[4] = highbd_idct_add_dual(step1[4], step1[5]);
+  step2[5] = highbd_idct_sub_dual(step1[4], step1[5]);
+  step2[6] = highbd_idct_sub_dual(step1[7], step1[6]);
+  step2[7] = highbd_idct_add_dual(step1[7], step1[6]);
+  step2[8] = step1[8];
+  highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+                           &step2[14]);
+  highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
+                               &step2[13], &step2[10]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  step1[0] = highbd_idct_add_dual(step2[0], step2[3]);
+  step1[1] = highbd_idct_add_dual(step2[1], step2[2]);
+  step1[2] = highbd_idct_sub_dual(step2[1], step2[2]);
+  step1[3] = highbd_idct_sub_dual(step2[0], step2[3]);
+  step1[4] = step2[4];
+  highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
+                            &step1[6]);
+  step1[7] = step2[7];
+  step1[8] = highbd_idct_add_dual(step2[8], step2[11]);
+  step1[9] = highbd_idct_add_dual(step2[9], step2[10]);
+  step1[10] = highbd_idct_sub_dual(step2[9], step2[10]);
+  step1[11] = highbd_idct_sub_dual(step2[8], step2[11]);
+  step1[12] = highbd_idct_sub_dual(step2[15], step2[12]);
+  step1[13] = highbd_idct_sub_dual(step2[14], step2[13]);
+  step1[14] = highbd_idct_add_dual(step2[14], step2[13]);
+  step1[15] = highbd_idct_add_dual(step2[15], step2[12]);
+
+  // stage 6
+  step2[0] = highbd_idct_add_dual(step1[0], step1[7]);
+  step2[1] = highbd_idct_add_dual(step1[1], step1[6]);
+  step2[2] = highbd_idct_add_dual(step1[2], step1[5]);
+  step2[3] = highbd_idct_add_dual(step1[3], step1[4]);
+  step2[4] = highbd_idct_sub_dual(step1[3], step1[4]);
+  step2[5] = highbd_idct_sub_dual(step1[2], step1[5]);
+  step2[6] = highbd_idct_sub_dual(step1[1], step1[6]);
+  step2[7] = highbd_idct_sub_dual(step1[0], step1[7]);
+  highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+                            &step2[13]);
+  highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+                            &step2[12]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  highbd_idct16x16_add_stage7_dual(step2, out);
+
+  // Pass 1 stores the intermediate; pass 2 (output == NULL) adds to dest.
+  if (output) {
+    highbd_idct16x16_store_pass1(out, output);
+  } else {
+    highbd_idct16x16_add_store(out, dest, stride, bd);
+  }
+}
+
+// Pass 1 of the 16x16 inverse DCT for the case named "10" (per the function
+// name, presumably at most 10 non-zero coefficients in the top-left 4x4
+// block — confirm against the caller). Reads a 4x4 block of coefficients and
+// writes the 16x4 intermediate result contiguously to |output|.
+static void highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
+                                                 int32_t *output) {
+  const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
+  const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
+  const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
+  const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
+  int32x4_t in[4], step1[16], step2[16], out[16];
+
+  // Load input (4x4): rows are 16 values apart in the source.
+  in[0] = vld1q_s32(input);
+  input += 16;
+  in[1] = vld1q_s32(input);
+  input += 16;
+  in[2] = vld1q_s32(input);
+  input += 16;
+  in[3] = vld1q_s32(input);
+
+  // Transpose
+  transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]);
+
+  // stage 1
+  // The /2 indexing maps coefficient indices onto the 4 loaded rows; all
+  // other rows are zero and are never materialized.
+  step1[0] = in[0 / 2];
+  step1[4] = in[4 / 2];
+  step1[8] = in[2 / 2];
+  step1[12] = in[6 / 2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[4] = step1[4];
+  step2[8] = highbd_idct_cospi_lane1(step1[8], vget_low_s32(cospi_2_30_10_22));
+  step2[11] =
+      highbd_idct_cospi_lane1(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+  step2[12] =
+      highbd_idct_cospi_lane0(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+  step2[15] = highbd_idct_cospi_lane0(step1[8], vget_low_s32(cospi_2_30_10_22));
+
+  // stage 3
+  // With the zero inputs, several butterflies collapse to plain copies.
+  step1[0] = step2[0];
+  step1[4] =
+      highbd_idct_cospi_lane1(step2[4], vget_high_s32(cospi_4_12_20N_28));
+  step1[7] = highbd_idct_cospi_lane0(step2[4], vget_low_s32(cospi_4_12_20N_28));
+  step1[8] = step2[8];
+  step1[9] = step2[8];
+  step1[10] = step2[11];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[13] = step2[12];
+  step1[14] = step2[15];
+  step1[15] = step2[15];
+
+  // stage 4
+  step2[0] = step2[1] =
+      highbd_idct_cospi_lane0(step1[0], vget_high_s32(cospi_0_8_16_24));
+  step2[4] = step1[4];
+  step2[5] = step1[4];
+  step2[6] = step1[7];
+  step2[7] = step1[7];
+  step2[8] = step1[8];
+  highbd_idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+                           &step2[14]);
+  highbd_idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24,
+                               &step2[13], &step2[10]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[1];
+  step1[3] = step2[0];
+  step1[4] = step2[4];
+  highbd_idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
+                            &step1[6]);
+  step1[7] = step2[7];
+  step1[8] = vaddq_s32(step2[8], step2[11]);
+  step1[9] = vaddq_s32(step2[9], step2[10]);
+  step1[10] = vsubq_s32(step2[9], step2[10]);
+  step1[11] = vsubq_s32(step2[8], step2[11]);
+  step1[12] = vsubq_s32(step2[15], step2[12]);
+  step1[13] = vsubq_s32(step2[14], step2[13]);
+  step1[14] = vaddq_s32(step2[14], step2[13]);
+  step1[15] = vaddq_s32(step2[15], step2[12]);
+
+  // stage 6
+  step2[0] = vaddq_s32(step1[0], step1[7]);
+  step2[1] = vaddq_s32(step1[1], step1[6]);
+  step2[2] = vaddq_s32(step1[2], step1[5]);
+  step2[3] = vaddq_s32(step1[3], step1[4]);
+  step2[4] = vsubq_s32(step1[3], step1[4]);
+  step2[5] = vsubq_s32(step1[2], step1[5]);
+  step2[6] = vsubq_s32(step1[1], step1[6]);
+  step2[7] = vsubq_s32(step1[0], step1[7]);
+  highbd_idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+                            &step2[13]);
+  highbd_idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+                            &step2[12]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  highbd_idct16x16_add_stage7(step2, out);
+
+  // pass 1: save the result into output
+  vst1q_s32(output, out[0]);
+  output += 4;
+  vst1q_s32(output, out[1]);
+  output += 4;
+  vst1q_s32(output, out[2]);
+  output += 4;
+  vst1q_s32(output, out[3]);
+  output += 4;
+  vst1q_s32(output, out[4]);
+  output += 4;
+  vst1q_s32(output, out[5]);
+  output += 4;
+  vst1q_s32(output, out[6]);
+  output += 4;
+  vst1q_s32(output, out[7]);
+  output += 4;
+  vst1q_s32(output, out[8]);
+  output += 4;
+  vst1q_s32(output, out[9]);
+  output += 4;
+  vst1q_s32(output, out[10]);
+  output += 4;
+  vst1q_s32(output, out[11]);
+  output += 4;
+  vst1q_s32(output, out[12]);
+  output += 4;
+  vst1q_s32(output, out[13]);
+  output += 4;
+  vst1q_s32(output, out[14]);
+  output += 4;
+  vst1q_s32(output, out[15]);
+}
+
+// Pass 2 of the 16x16 inverse DCT for the "10" case: consumes a 4x8 strip of
+// pass-1 intermediates. When |output| is non-NULL the result is stored for a
+// further pass; otherwise it is added to |dest| and clamped for bit depth
+// |bd|.
+static void highbd_idct16x16_10_add_half1d_pass2(const int32_t *input,
+                                                 int32_t *const output,
+                                                 uint16_t *const dest,
+                                                 const int stride,
+                                                 const int bd) {
+  const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
+  const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
+  const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
+  const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
+  int32x4x2_t in[4], step1[16], step2[16], out[16];
+
+  // Load input (4x8): the pass-1 strip is stored contiguously.
+  in[0].val[0] = vld1q_s32(input);
+  input += 4;
+  in[0].val[1] = vld1q_s32(input);
+  input += 4;
+  in[1].val[0] = vld1q_s32(input);
+  input += 4;
+  in[1].val[1] = vld1q_s32(input);
+  input += 4;
+  in[2].val[0] = vld1q_s32(input);
+  input += 4;
+  in[2].val[1] = vld1q_s32(input);
+  input += 4;
+  in[3].val[0] = vld1q_s32(input);
+  input += 4;
+  in[3].val[1] = vld1q_s32(input);
+
+  // Transpose
+  transpose_s32_4x8(&in[0].val[0], &in[0].val[1], &in[1].val[0], &in[1].val[1],
+                    &in[2].val[0], &in[2].val[1], &in[3].val[0], &in[3].val[1]);
+
+  // stage 1
+  // The /2 indexing maps coefficient indices onto the 4 loaded row pairs.
+  step1[0] = in[0 / 2];
+  step1[4] = in[4 / 2];
+  step1[8] = in[2 / 2];
+  step1[12] = in[6 / 2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[4] = step1[4];
+  step2[8] =
+      highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
+  step2[11] =
+      highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+  step2[12] =
+      highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+  step2[15] =
+      highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
+
+  // stage 3
+  // With the zero inputs, several butterflies collapse to plain copies.
+  step1[0] = step2[0];
+  step1[4] =
+      highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28));
+  step1[7] =
+      highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28));
+  step1[8] = step2[8];
+  step1[9] = step2[8];
+  step1[10] = step2[11];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[13] = step2[12];
+  step1[14] = step2[15];
+  step1[15] = step2[15];
+
+  // stage 4
+  step2[0] = step2[1] =
+      highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24));
+  step2[4] = step1[4];
+  step2[5] = step1[4];
+  step2[6] = step1[7];
+  step2[7] = step1[7];
+  step2[8] = step1[8];
+  highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+                           &step2[14]);
+  highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
+                               &step2[13], &step2[10]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[1];
+  step1[3] = step2[0];
+  step1[4] = step2[4];
+  highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
+                            &step1[6]);
+  step1[7] = step2[7];
+  step1[8] = highbd_idct_add_dual(step2[8], step2[11]);
+  step1[9] = highbd_idct_add_dual(step2[9], step2[10]);
+  step1[10] = highbd_idct_sub_dual(step2[9], step2[10]);
+  step1[11] = highbd_idct_sub_dual(step2[8], step2[11]);
+  step1[12] = highbd_idct_sub_dual(step2[15], step2[12]);
+  step1[13] = highbd_idct_sub_dual(step2[14], step2[13]);
+  step1[14] = highbd_idct_add_dual(step2[14], step2[13]);
+  step1[15] = highbd_idct_add_dual(step2[15], step2[12]);
+
+  // stage 6
+  step2[0] = highbd_idct_add_dual(step1[0], step1[7]);
+  step2[1] = highbd_idct_add_dual(step1[1], step1[6]);
+  step2[2] = highbd_idct_add_dual(step1[2], step1[5]);
+  step2[3] = highbd_idct_add_dual(step1[3], step1[4]);
+  step2[4] = highbd_idct_sub_dual(step1[3], step1[4]);
+  step2[5] = highbd_idct_sub_dual(step1[2], step1[5]);
+  step2[6] = highbd_idct_sub_dual(step1[1], step1[6]);
+  step2[7] = highbd_idct_sub_dual(step1[0], step1[7]);
+  highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+                            &step2[13]);
+  highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+                            &step2[12]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  highbd_idct16x16_add_stage7_dual(step2, out);
+
+  // Store for a further pass, or add to dest when this is the final pass.
+  if (output) {
+    highbd_idct16x16_store_pass1(out, output);
+  } else {
+    highbd_idct16x16_add_store(out, dest, stride, bd);
+  }
+}
+
+// Full two-pass 16x16 inverse DCT; the result is added to |dest|.
+// bd == 8 uses the 16-bit (non-highbd) half1d kernels (final argument 1 is
+// presumably a highbd flag — confirm against vpx_idct16x16_256_add_half1d);
+// other bit depths use the 32-bit highbd kernels.
+void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest,
+                                       int stride, int bd) {
+  if (bd == 8) {
+    int16_t row_idct_output[16 * 16];
+
+    // pass 1
+    // Parallel idct on the upper 8 rows
+    vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 1);
+
+    // Parallel idct on the lower 8 rows
+    vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest,
+                                 stride, 1);
+
+    // pass 2
+    // Parallel idct to get the left 8 columns
+    vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 1);
+
+    // Parallel idct to get the right 8 columns
+    vpx_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL, dest + 8,
+                                 stride, 1);
+  } else {
+    int32_t row_idct_output[16 * 16];
+
+    // pass 1
+    // Parallel idct on the upper 8 rows
+    vpx_highbd_idct16x16_256_add_half1d(input, row_idct_output, dest, stride,
+                                        bd);
+
+    // Parallel idct on the lower 8 rows
+    vpx_highbd_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8,
+                                        dest, stride, bd);
+
+    // pass 2
+    // Parallel idct to get the left 8 columns
+    vpx_highbd_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride,
+                                        bd);
+
+    // Parallel idct to get the right 8 columns
+    vpx_highbd_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL,
+                                        dest + 8, stride, bd);
+  }
+}
+
+// Two-pass 16x16 inverse DCT for the "38" case. Only one pass-1 call is
+// needed: the lower 8 input rows are all zero in this case, so pass 1 runs
+// on the upper 8 rows only.
+void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest,
+                                      int stride, int bd) {
+  if (bd == 8) {
+    int16_t row_idct_output[16 * 16];
+
+    // pass 1
+    // Parallel idct on the upper 8 rows
+    vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 1);
+
+    // pass 2
+    // Parallel idct to get the left 8 columns
+    vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 1);
+
+    // Parallel idct to get the right 8 columns
+    vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8,
+                                stride, 1);
+  } else {
+    int32_t row_idct_output[16 * 16];
+
+    // pass 1
+    // Parallel idct on the upper 8 rows
+    vpx_highbd_idct16x16_38_add_half1d(input, row_idct_output, dest, stride,
+                                       bd);
+
+    // pass 2
+    // Parallel idct to get the left 8 columns
+    vpx_highbd_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, bd);
+
+    // Parallel idct to get the right 8 columns
+    vpx_highbd_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8,
+                                       stride, bd);
+  }
+}
+
+// Two-pass 16x16 inverse DCT for the "10" case. Pass 1 processes only the
+// top rows (the intermediate buffer is 4x16 — row_idct_output[4 * 16] — so
+// only 4 rows of intermediates are produced, not 8).
+void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest,
+                                      int stride, int bd) {
+  if (bd == 8) {
+    int16_t row_idct_output[4 * 16];
+
+    // pass 1
+    // Parallel idct on the upper rows (only 4 rows of intermediates)
+    vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output);
+
+    // pass 2
+    // Parallel idct to get the left 8 columns
+    vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 1);
+
+    // Parallel idct to get the right 8 columns
+    vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,
+                                      stride, 1);
+  } else {
+    int32_t row_idct_output[4 * 16];
+
+    // pass 1
+    // Parallel idct on the upper rows (only 4 rows of intermediates)
+    highbd_idct16x16_10_add_half1d_pass1(input, row_idct_output);
+
+    // pass 2
+    // Parallel idct to get the left 8 columns
+    highbd_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride,
+                                         bd);
+
+    // Parallel idct to get the right 8 columns
+    highbd_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL,
+                                         dest + 8, stride, bd);
+  }
+}
+
+// Add the non-negative DC value |res| to one 16-pixel row at *dest, clamp
+// each result to |max| (the bit-depth ceiling), store it back, and advance
+// *dest by one row.
+static INLINE void highbd_idct16x16_1_add_pos_kernel(uint16_t **dest,
+                                                     const int stride,
+                                                     const int16x8_t res,
+                                                     const int16x8_t max) {
+  const uint16x8_t in0 = vld1q_u16(*dest + 0);
+  const uint16x8_t in1 = vld1q_u16(*dest + 8);
+  const int16x8_t sum0 = vaddq_s16(res, vreinterpretq_s16_u16(in0));
+  const int16x8_t sum1 = vaddq_s16(res, vreinterpretq_s16_u16(in1));
+  vst1q_u16(*dest + 0, vreinterpretq_u16_s16(vminq_s16(sum0, max)));
+  vst1q_u16(*dest + 8, vreinterpretq_u16_s16(vminq_s16(sum1, max)));
+  *dest += stride;
+}
+
+// Add the negative DC value |res| to one 16-pixel row at *dest, clamp the
+// results at zero (vqshluq_n_s16 by 0 saturates negatives to 0), store it
+// back, and advance *dest by one row.
+static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest,
+                                                     const int stride,
+                                                     const int16x8_t res) {
+  const uint16x8_t in0 = vld1q_u16(*dest + 0);
+  const uint16x8_t in1 = vld1q_u16(*dest + 8);
+  const int16x8_t sum0 = vaddq_s16(res, vreinterpretq_s16_u16(in0));
+  const int16x8_t sum1 = vaddq_s16(res, vreinterpretq_s16_u16(in1));
+  vst1q_u16(*dest + 0, vqshluq_n_s16(sum0, 0));
+  vst1q_u16(*dest + 8, vqshluq_n_s16(sum1, 0));
+  *dest += stride;
+}
+
+// DC-only 16x16 inverse transform: derives the single reconstructed value
+// from input[0] and adds it to all 256 pixels of |dest|, clamped to the
+// valid range for bit depth |bd|.
+void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest,
+                                     int stride, int bd) {
+  const tran_low_t out0 = HIGHBD_WRAPLOW(
+      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+  const tran_low_t out1 = HIGHBD_WRAPLOW(
+      dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
+  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+  const int16x8_t dc = vdupq_n_s16(a1);
+  int row;
+
+  if (a1 >= 0) {
+    // Positive DC: clamp against the bit-depth maximum.
+    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+    for (row = 0; row < 16; ++row) {
+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+    }
+  } else {
+    // Negative DC: only the lower clamp at zero is needed.
+    for (row = 0; row < 16; ++row) {
+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c
new file mode 100644
index 0000000000..5b36f73367
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c
@@ -0,0 +1,640 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+// Load two 8-wide rows (|first| and |second|) from the transposed buffer,
+// where each row occupies 8 contiguous int32 values.
+static INLINE void load_from_transformed(const int32_t *const trans_buf,
+                                         const int first, const int second,
+                                         int32x4x2_t *const q0,
+                                         int32x4x2_t *const q1) {
+  const int32_t *const p0 = trans_buf + first * 8;
+  const int32_t *const p1 = trans_buf + second * 8;
+
+  q0->val[0] = vld1q_s32(p0);
+  q0->val[1] = vld1q_s32(p0 + 4);
+  q1->val[0] = vld1q_s32(p1);
+  q1->val[1] = vld1q_s32(p1 + 4);
+}
+
+// Load two 8-wide rows (|first| and |second|) from the output buffer, where
+// rows are 32 int32 values apart.
+static INLINE void load_from_output(const int32_t *const out, const int first,
+                                    const int second, int32x4x2_t *const q0,
+                                    int32x4x2_t *const q1) {
+  const int32_t *const p0 = out + first * 32;
+  const int32_t *const p1 = out + second * 32;
+
+  q0->val[0] = vld1q_s32(p0);
+  q0->val[1] = vld1q_s32(p0 + 4);
+  q1->val[0] = vld1q_s32(p1);
+  q1->val[1] = vld1q_s32(p1 + 4);
+}
+
+// Store two 8-wide rows to positions |first| and |second| of the output
+// buffer (rows are 32 int32 values apart).
+static INLINE void store_in_output(int32_t *const out, const int first,
+                                   const int second, const int32x4x2_t q0,
+                                   const int32x4x2_t q1) {
+  int32_t *const p0 = out + first * 32;
+  int32_t *const p1 = out + second * 32;
+
+  vst1q_s32(p0, q0.val[0]);
+  vst1q_s32(p0 + 4, q0.val[1]);
+  vst1q_s32(p1, q1.val[0]);
+  vst1q_s32(p1 + 4, q1.val[1]);
+}
+
+// Round four rows of 32-bit residuals down by 6 bits, add them to four
+// 8-pixel destination rows (p1, p1+stride, p2-stride, p2), clamp to
+// [0, max], and store back. Note the order: q0/q1 go to p1's pair walking
+// down, q2/q3 to p2's pair walking up.
+static INLINE void highbd_store_combine_results(
+    uint16_t *p1, uint16_t *p2, const int stride, const int32x4x2_t q0,
+    const int32x4x2_t q1, const int32x4x2_t q2, const int32x4x2_t q3,
+    const int16x8_t max) {
+  int16x8_t o[4];
+  uint16x8_t d[4];
+
+  // Load all four destination rows before any store (p1/p2 are adjusted in
+  // place and restored by the mirrored stores below).
+  d[0] = vld1q_u16(p1);
+  p1 += stride;
+  d[1] = vld1q_u16(p1);
+  d[3] = vld1q_u16(p2);
+  p2 -= stride;
+  d[2] = vld1q_u16(p2);
+
+  // Narrow with rounding right shift by 6 (inverse-transform scale).
+  o[0] = vcombine_s16(vrshrn_n_s32(q0.val[0], 6), vrshrn_n_s32(q0.val[1], 6));
+  o[1] = vcombine_s16(vrshrn_n_s32(q1.val[0], 6), vrshrn_n_s32(q1.val[1], 6));
+  o[2] = vcombine_s16(vrshrn_n_s32(q2.val[0], 6), vrshrn_n_s32(q2.val[1], 6));
+  o[3] = vcombine_s16(vrshrn_n_s32(q3.val[0], 6), vrshrn_n_s32(q3.val[1], 6));
+
+  o[0] = vqaddq_s16(o[0], vreinterpretq_s16_u16(d[0]));
+  o[1] = vqaddq_s16(o[1], vreinterpretq_s16_u16(d[1]));
+  o[2] = vqaddq_s16(o[2], vreinterpretq_s16_u16(d[2]));
+  o[3] = vqaddq_s16(o[3], vreinterpretq_s16_u16(d[3]));
+  // Upper clamp to the bit-depth maximum ...
+  o[0] = vminq_s16(o[0], max);
+  o[1] = vminq_s16(o[1], max);
+  o[2] = vminq_s16(o[2], max);
+  o[3] = vminq_s16(o[3], max);
+  // ... and lower clamp at zero via saturating shift-left-unsigned by 0.
+  d[0] = vqshluq_n_s16(o[0], 0);
+  d[1] = vqshluq_n_s16(o[1], 0);
+  d[2] = vqshluq_n_s16(o[2], 0);
+  d[3] = vqshluq_n_s16(o[3], 0);
+
+  vst1q_u16(p1, d[1]);
+  p1 -= stride;
+  vst1q_u16(p1, d[0]);
+  vst1q_u16(p2, d[2]);
+  p2 += stride;
+  vst1q_u16(p2, d[3]);
+}
+
+// Butterfly rotation at 64-bit precision:
+//   *qOut0 = round_shift(qIn0 * first_const - qIn1 * second_const)
+//   *qOut1 = round_shift(qIn0 * second_const + qIn1 * first_const)
+static INLINE void do_butterfly(const int32x4x2_t qIn0, const int32x4x2_t qIn1,
+                                const int32_t first_const,
+                                const int32_t second_const,
+                                int32x4x2_t *const qOut0,
+                                int32x4x2_t *const qOut1) {
+  int64x2x2_t t[4];
+
+  // Note: using v{mul, mla, mls}l_n_s32 here slows down 35% with gcc 4.9.
+  const int32x2_t c0 = vdup_n_s32(first_const);
+  const int32x2_t c1 = vdup_n_s32(second_const);
+
+  t[0].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), c0);
+  t[0].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), c0);
+  t[1].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), c0);
+  t[1].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), c0);
+  t[0].val[0] = vmlsl_s32(t[0].val[0], vget_low_s32(qIn1.val[0]), c1);
+  t[0].val[1] = vmlsl_s32(t[0].val[1], vget_high_s32(qIn1.val[0]), c1);
+  t[1].val[0] = vmlsl_s32(t[1].val[0], vget_low_s32(qIn1.val[1]), c1);
+  t[1].val[1] = vmlsl_s32(t[1].val[1], vget_high_s32(qIn1.val[1]), c1);
+
+  t[2].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), c1);
+  t[2].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), c1);
+  t[3].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), c1);
+  t[3].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), c1);
+  t[2].val[0] = vmlal_s32(t[2].val[0], vget_low_s32(qIn1.val[0]), c0);
+  t[2].val[1] = vmlal_s32(t[2].val[1], vget_high_s32(qIn1.val[0]), c0);
+  t[3].val[0] = vmlal_s32(t[3].val[0], vget_low_s32(qIn1.val[1]), c0);
+  t[3].val[1] = vmlal_s32(t[3].val[1], vget_high_s32(qIn1.val[1]), c0);
+
+  // Narrow back to 32 bits with the DCT rounding shift.
+  qOut0->val[0] = vcombine_s32(vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS),
+                               vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS));
+  qOut0->val[1] = vcombine_s32(vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS),
+                               vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS));
+  qOut1->val[0] = vcombine_s32(vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS),
+                               vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS));
+  qOut1->val[1] = vcombine_s32(vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS),
+                               vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS));
+}
+
+// Load 8 rows of 8 int32s each into s[0..7]; source rows are 32 values
+// apart.
+static INLINE void load_s32x4q_dual(const int32_t *in, int32x4x2_t *const s) {
+  int i;
+
+  for (i = 0; i < 8; ++i) {
+    s[i].val[0] = vld1q_s32(in);
+    s[i].val[1] = vld1q_s32(in + 4);
+    in += 32;
+  }
+}
+
+// Transpose the 8x8 tile held in a[0..7] in place, then store it
+// contiguously to *out, advancing *out past the 64 values written.
+static INLINE void transpose_and_store_s32_8x8(int32x4x2_t *const a,
+                                               int32_t **out) {
+  int i;
+
+  transpose_s32_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+
+  for (i = 0; i < 8; ++i) {
+    vst1q_s32(*out, a[i].val[0]);
+    *out += 4;
+    vst1q_s32(*out, a[i].val[1]);
+    *out += 4;
+  }
+}
+
+// Transpose a 32x8 strip of |input| into |t_buf| as four consecutive 8x8
+// tile transposes.
+static INLINE void idct32_transpose_pair(const int32_t *input, int32_t *t_buf) {
+  int i;
+  int32x4x2_t rows[8];
+
+  for (i = 0; i < 4; ++i) {
+    load_s32x4q_dual(input, rows);
+    transpose_and_store_s32_8x8(rows, &t_buf);
+    input += 8;
+  }
+}
+
+// Final combination stage of pass 1: merges the band intermediates held in
+// q[2..15] with rows previously stored in |out|, writing sums to the low
+// row indices and differences back to the high ones. q[] entries are reused
+// aggressively between sections, so statement order matters throughout.
+static INLINE void idct32_bands_end_1st_pass(int32_t *const out,
+                                             int32x4x2_t *const q) {
+  store_in_output(out, 16, 17, q[6], q[7]);
+  store_in_output(out, 14, 15, q[8], q[9]);
+
+  load_from_output(out, 30, 31, &q[0], &q[1]);
+  q[4] = highbd_idct_add_dual(q[2], q[1]);
+  q[5] = highbd_idct_add_dual(q[3], q[0]);
+  q[6] = highbd_idct_sub_dual(q[3], q[0]);
+  q[7] = highbd_idct_sub_dual(q[2], q[1]);
+  store_in_output(out, 30, 31, q[6], q[7]);
+  store_in_output(out, 0, 1, q[4], q[5]);
+
+  load_from_output(out, 12, 13, &q[0], &q[1]);
+  q[2] = highbd_idct_add_dual(q[10], q[1]);
+  q[3] = highbd_idct_add_dual(q[11], q[0]);
+  q[4] = highbd_idct_sub_dual(q[11], q[0]);
+  q[5] = highbd_idct_sub_dual(q[10], q[1]);
+
+  load_from_output(out, 18, 19, &q[0], &q[1]);
+  q[8] = highbd_idct_add_dual(q[4], q[1]);
+  q[9] = highbd_idct_add_dual(q[5], q[0]);
+  q[6] = highbd_idct_sub_dual(q[5], q[0]);
+  q[7] = highbd_idct_sub_dual(q[4], q[1]);
+  store_in_output(out, 18, 19, q[6], q[7]);
+  store_in_output(out, 12, 13, q[8], q[9]);
+
+  load_from_output(out, 28, 29, &q[0], &q[1]);
+  q[4] = highbd_idct_add_dual(q[2], q[1]);
+  q[5] = highbd_idct_add_dual(q[3], q[0]);
+  q[6] = highbd_idct_sub_dual(q[3], q[0]);
+  q[7] = highbd_idct_sub_dual(q[2], q[1]);
+  store_in_output(out, 28, 29, q[6], q[7]);
+  store_in_output(out, 2, 3, q[4], q[5]);
+
+  load_from_output(out, 10, 11, &q[0], &q[1]);
+  q[2] = highbd_idct_add_dual(q[12], q[1]);
+  q[3] = highbd_idct_add_dual(q[13], q[0]);
+  q[4] = highbd_idct_sub_dual(q[13], q[0]);
+  q[5] = highbd_idct_sub_dual(q[12], q[1]);
+
+  load_from_output(out, 20, 21, &q[0], &q[1]);
+  q[8] = highbd_idct_add_dual(q[4], q[1]);
+  q[9] = highbd_idct_add_dual(q[5], q[0]);
+  q[6] = highbd_idct_sub_dual(q[5], q[0]);
+  q[7] = highbd_idct_sub_dual(q[4], q[1]);
+  store_in_output(out, 20, 21, q[6], q[7]);
+  store_in_output(out, 10, 11, q[8], q[9]);
+
+  load_from_output(out, 26, 27, &q[0], &q[1]);
+  q[4] = highbd_idct_add_dual(q[2], q[1]);
+  q[5] = highbd_idct_add_dual(q[3], q[0]);
+  q[6] = highbd_idct_sub_dual(q[3], q[0]);
+  q[7] = highbd_idct_sub_dual(q[2], q[1]);
+  store_in_output(out, 26, 27, q[6], q[7]);
+  store_in_output(out, 4, 5, q[4], q[5]);
+
+  load_from_output(out, 8, 9, &q[0], &q[1]);
+  q[2] = highbd_idct_add_dual(q[14], q[1]);
+  q[3] = highbd_idct_add_dual(q[15], q[0]);
+  q[4] = highbd_idct_sub_dual(q[15], q[0]);
+  q[5] = highbd_idct_sub_dual(q[14], q[1]);
+
+  load_from_output(out, 22, 23, &q[0], &q[1]);
+  q[8] = highbd_idct_add_dual(q[4], q[1]);
+  q[9] = highbd_idct_add_dual(q[5], q[0]);
+  q[6] = highbd_idct_sub_dual(q[5], q[0]);
+  q[7] = highbd_idct_sub_dual(q[4], q[1]);
+  store_in_output(out, 22, 23, q[6], q[7]);
+  store_in_output(out, 8, 9, q[8], q[9]);
+
+  load_from_output(out, 24, 25, &q[0], &q[1]);
+  q[4] = highbd_idct_add_dual(q[2], q[1]);
+  q[5] = highbd_idct_add_dual(q[3], q[0]);
+  q[6] = highbd_idct_sub_dual(q[3], q[0]);
+  q[7] = highbd_idct_sub_dual(q[2], q[1]);
+  store_in_output(out, 24, 25, q[6], q[7]);
+  store_in_output(out, 6, 7, q[4], q[5]);
+}
+
+// Final combination stage of pass 2: same butterfly structure as the 1st
+// pass, but results go straight to the frame via
+// highbd_store_combine_results(). dest0/dest1 walk inward from rows 0 and
+// 31, dest2/dest3 from rows 16 and 15; |max| is the bit-depth ceiling for
+// clamping. As in pass 1, q[] entries are reused between sections, so
+// statement order matters.
+static INLINE void idct32_bands_end_2nd_pass(const int32_t *const out,
+                                             uint16_t *const dest,
+                                             const int stride,
+                                             const int16x8_t max,
+                                             int32x4x2_t *const q) {
+  uint16_t *dest0 = dest + 0 * stride;
+  uint16_t *dest1 = dest + 31 * stride;
+  uint16_t *dest2 = dest + 16 * stride;
+  uint16_t *dest3 = dest + 15 * stride;
+  const int str2 = stride << 1;
+
+  highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
+                               max);
+  dest2 += str2;
+  dest3 -= str2;
+
+  load_from_output(out, 30, 31, &q[0], &q[1]);
+  q[4] = highbd_idct_add_dual(q[2], q[1]);
+  q[5] = highbd_idct_add_dual(q[3], q[0]);
+  q[6] = highbd_idct_sub_dual(q[3], q[0]);
+  q[7] = highbd_idct_sub_dual(q[2], q[1]);
+  highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
+                               max);
+  dest0 += str2;
+  dest1 -= str2;
+
+  load_from_output(out, 12, 13, &q[0], &q[1]);
+  q[2] = highbd_idct_add_dual(q[10], q[1]);
+  q[3] = highbd_idct_add_dual(q[11], q[0]);
+  q[4] = highbd_idct_sub_dual(q[11], q[0]);
+  q[5] = highbd_idct_sub_dual(q[10], q[1]);
+
+  load_from_output(out, 18, 19, &q[0], &q[1]);
+  q[8] = highbd_idct_add_dual(q[4], q[1]);
+  q[9] = highbd_idct_add_dual(q[5], q[0]);
+  q[6] = highbd_idct_sub_dual(q[5], q[0]);
+  q[7] = highbd_idct_sub_dual(q[4], q[1]);
+  highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
+                               max);
+  dest2 += str2;
+  dest3 -= str2;
+
+  load_from_output(out, 28, 29, &q[0], &q[1]);
+  q[4] = highbd_idct_add_dual(q[2], q[1]);
+  q[5] = highbd_idct_add_dual(q[3], q[0]);
+  q[6] = highbd_idct_sub_dual(q[3], q[0]);
+  q[7] = highbd_idct_sub_dual(q[2], q[1]);
+  highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
+                               max);
+  dest0 += str2;
+  dest1 -= str2;
+
+  load_from_output(out, 10, 11, &q[0], &q[1]);
+  q[2] = highbd_idct_add_dual(q[12], q[1]);
+  q[3] = highbd_idct_add_dual(q[13], q[0]);
+  q[4] = highbd_idct_sub_dual(q[13], q[0]);
+  q[5] = highbd_idct_sub_dual(q[12], q[1]);
+
+  load_from_output(out, 20, 21, &q[0], &q[1]);
+  q[8] = highbd_idct_add_dual(q[4], q[1]);
+  q[9] = highbd_idct_add_dual(q[5], q[0]);
+  q[6] = highbd_idct_sub_dual(q[5], q[0]);
+  q[7] = highbd_idct_sub_dual(q[4], q[1]);
+  highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
+                               max);
+  dest2 += str2;
+  dest3 -= str2;
+
+  load_from_output(out, 26, 27, &q[0], &q[1]);
+  q[4] = highbd_idct_add_dual(q[2], q[1]);
+  q[5] = highbd_idct_add_dual(q[3], q[0]);
+  q[6] = highbd_idct_sub_dual(q[3], q[0]);
+  q[7] = highbd_idct_sub_dual(q[2], q[1]);
+  highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
+                               max);
+  dest0 += str2;
+  dest1 -= str2;
+
+  load_from_output(out, 8, 9, &q[0], &q[1]);
+  q[2] = highbd_idct_add_dual(q[14], q[1]);
+  q[3] = highbd_idct_add_dual(q[15], q[0]);
+  q[4] = highbd_idct_sub_dual(q[15], q[0]);
+  q[5] = highbd_idct_sub_dual(q[14], q[1]);
+
+  load_from_output(out, 22, 23, &q[0], &q[1]);
+  q[8] = highbd_idct_add_dual(q[4], q[1]);
+  q[9] = highbd_idct_add_dual(q[5], q[0]);
+  q[6] = highbd_idct_sub_dual(q[5], q[0]);
+  q[7] = highbd_idct_sub_dual(q[4], q[1]);
+  highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
+                               max);
+
+  load_from_output(out, 24, 25, &q[0], &q[1]);
+  q[4] = highbd_idct_add_dual(q[2], q[1]);
+  q[5] = highbd_idct_add_dual(q[3], q[0]);
+  q[6] = highbd_idct_sub_dual(q[3], q[0]);
+  q[7] = highbd_idct_sub_dual(q[2], q[1]);
+  highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
+                               max);
+}
+
+// Full 1024-coefficient high-bitdepth 32x32 inverse DCT.
+//
+// Two passes (rows, then columns), each processing the 32x32 block as four
+// 8-wide slices. Pass 1 reads `input` and writes 32-bit intermediate values
+// to pass1[]; pass 2 re-reads pass1[] (note the `input = pass1` swap in the
+// outer for-statement), clamps results to [0, (1 << bd) - 1] and accumulates
+// them into `dst`. q[] is a bank of 16 register pairs reused heavily across
+// the butterfly stages; statement order is load-bearing.
+static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input,
+ uint16_t *dst, const int stride,
+ const int bd) {
+ int i, idct32_pass_loop;
+ // trans_buf holds one transposed 32x8 slice of the current pass's input.
+ int32_t trans_buf[32 * 8];
+ int32_t pass1[32 * 32];
+ int32_t pass2[32 * 32];
+ int32_t *out;
+ int32x4x2_t q[16];
+
+ for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
+ idct32_pass_loop++, input = pass1, out = pass2) {
+ for (i = 0; i < 4; i++, out += 8) { // idct32_bands_loop
+ idct32_transpose_pair(input, trans_buf);
+ input += 32 * 8;
+
+ // -----------------------------------------
+ // BLOCK A: 16-19,28-31
+ // -----------------------------------------
+ // generate 16,17,30,31
+ // part of stage 1
+ load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]);
+ load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]);
+ // part of stage 2
+ q[4] = highbd_idct_add_dual(q[0], q[1]);
+ q[13] = highbd_idct_sub_dual(q[0], q[1]);
+ q[6] = highbd_idct_add_dual(q[2], q[3]);
+ q[14] = highbd_idct_sub_dual(q[2], q[3]);
+ // part of stage 3
+ do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]);
+
+ // generate 18,19,28,29
+ // part of stage 1
+ load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]);
+ load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]);
+ // part of stage 2
+ q[13] = highbd_idct_sub_dual(q[3], q[2]);
+ q[3] = highbd_idct_add_dual(q[3], q[2]);
+ q[14] = highbd_idct_sub_dual(q[1], q[0]);
+ q[2] = highbd_idct_add_dual(q[1], q[0]);
+ // part of stage 3
+ do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]);
+ // part of stage 4
+ q[8] = highbd_idct_add_dual(q[4], q[2]);
+ q[9] = highbd_idct_add_dual(q[5], q[0]);
+ q[10] = highbd_idct_add_dual(q[7], q[1]);
+ q[15] = highbd_idct_add_dual(q[6], q[3]);
+ q[13] = highbd_idct_sub_dual(q[5], q[0]);
+ q[14] = highbd_idct_sub_dual(q[7], q[1]);
+ store_in_output(out, 16, 31, q[8], q[15]);
+ store_in_output(out, 17, 30, q[9], q[10]);
+ // part of stage 5
+ do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]);
+ store_in_output(out, 29, 18, q[1], q[0]);
+ // part of stage 4
+ q[13] = highbd_idct_sub_dual(q[4], q[2]);
+ q[14] = highbd_idct_sub_dual(q[6], q[3]);
+ // part of stage 5
+ do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]);
+ store_in_output(out, 19, 28, q[4], q[6]);
+
+ // -----------------------------------------
+ // BLOCK B: 20-23,24-27
+ // -----------------------------------------
+ // generate 20,21,26,27
+ // part of stage 1
+ load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]);
+ load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]);
+ // part of stage 2
+ q[13] = highbd_idct_sub_dual(q[0], q[1]);
+ q[0] = highbd_idct_add_dual(q[0], q[1]);
+ q[14] = highbd_idct_sub_dual(q[2], q[3]);
+ q[2] = highbd_idct_add_dual(q[2], q[3]);
+ // part of stage 3
+ do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
+
+ // generate 22,23,24,25
+ // part of stage 1
+ load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]);
+ load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]);
+ // part of stage 2
+ q[14] = highbd_idct_sub_dual(q[4], q[5]);
+ q[5] = highbd_idct_add_dual(q[4], q[5]);
+ q[13] = highbd_idct_sub_dual(q[6], q[7]);
+ q[6] = highbd_idct_add_dual(q[6], q[7]);
+ // part of stage 3
+ do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]);
+ // part of stage 4
+ q[10] = highbd_idct_add_dual(q[7], q[1]);
+ q[11] = highbd_idct_add_dual(q[5], q[0]);
+ q[12] = highbd_idct_add_dual(q[6], q[2]);
+ q[15] = highbd_idct_add_dual(q[4], q[3]);
+ // part of stage 6
+ load_from_output(out, 16, 17, &q[14], &q[13]);
+ q[8] = highbd_idct_add_dual(q[14], q[11]);
+ q[9] = highbd_idct_add_dual(q[13], q[10]);
+ q[13] = highbd_idct_sub_dual(q[13], q[10]);
+ q[11] = highbd_idct_sub_dual(q[14], q[11]);
+ store_in_output(out, 17, 16, q[9], q[8]);
+ load_from_output(out, 30, 31, &q[14], &q[9]);
+ q[8] = highbd_idct_sub_dual(q[9], q[12]);
+ q[10] = highbd_idct_add_dual(q[14], q[15]);
+ q[14] = highbd_idct_sub_dual(q[14], q[15]);
+ q[12] = highbd_idct_add_dual(q[9], q[12]);
+ store_in_output(out, 30, 31, q[10], q[12]);
+ // part of stage 7
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+ store_in_output(out, 25, 22, q[14], q[13]);
+ do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+ store_in_output(out, 24, 23, q[14], q[13]);
+ // part of stage 4
+ q[14] = highbd_idct_sub_dual(q[5], q[0]);
+ q[13] = highbd_idct_sub_dual(q[6], q[2]);
+ do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]);
+ q[14] = highbd_idct_sub_dual(q[7], q[1]);
+ q[13] = highbd_idct_sub_dual(q[4], q[3]);
+ do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]);
+ // part of stage 6
+ load_from_output(out, 18, 19, &q[14], &q[13]);
+ q[8] = highbd_idct_add_dual(q[14], q[1]);
+ q[9] = highbd_idct_add_dual(q[13], q[6]);
+ q[13] = highbd_idct_sub_dual(q[13], q[6]);
+ q[1] = highbd_idct_sub_dual(q[14], q[1]);
+ store_in_output(out, 18, 19, q[8], q[9]);
+ load_from_output(out, 28, 29, &q[8], &q[9]);
+ q[14] = highbd_idct_sub_dual(q[8], q[5]);
+ q[10] = highbd_idct_add_dual(q[8], q[5]);
+ q[11] = highbd_idct_add_dual(q[9], q[0]);
+ q[0] = highbd_idct_sub_dual(q[9], q[0]);
+ store_in_output(out, 28, 29, q[10], q[11]);
+ // part of stage 7
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+ store_in_output(out, 20, 27, q[13], q[14]);
+ do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]);
+ store_in_output(out, 21, 26, q[1], q[0]);
+
+ // -----------------------------------------
+ // BLOCK C: 8-10,11-15
+ // -----------------------------------------
+ // generate 8,9,14,15
+ // part of stage 2
+ load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]);
+ load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]);
+ // part of stage 3
+ q[13] = highbd_idct_sub_dual(q[0], q[1]);
+ q[0] = highbd_idct_add_dual(q[0], q[1]);
+ q[14] = highbd_idct_sub_dual(q[2], q[3]);
+ q[2] = highbd_idct_add_dual(q[2], q[3]);
+ // part of stage 4
+ do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]);
+
+ // generate 10,11,12,13
+ // part of stage 2
+ load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]);
+ load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]);
+ // part of stage 3
+ q[14] = highbd_idct_sub_dual(q[4], q[5]);
+ q[5] = highbd_idct_add_dual(q[4], q[5]);
+ q[13] = highbd_idct_sub_dual(q[6], q[7]);
+ q[6] = highbd_idct_add_dual(q[6], q[7]);
+ // part of stage 4
+ do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]);
+ // part of stage 5
+ q[8] = highbd_idct_add_dual(q[0], q[5]);
+ q[9] = highbd_idct_add_dual(q[1], q[7]);
+ q[13] = highbd_idct_sub_dual(q[1], q[7]);
+ q[14] = highbd_idct_sub_dual(q[3], q[4]);
+ q[10] = highbd_idct_add_dual(q[3], q[4]);
+ q[15] = highbd_idct_add_dual(q[2], q[6]);
+ store_in_output(out, 8, 15, q[8], q[15]);
+ store_in_output(out, 9, 14, q[9], q[10]);
+ // part of stage 6
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+ store_in_output(out, 13, 10, q[3], q[1]);
+ q[13] = highbd_idct_sub_dual(q[0], q[5]);
+ q[14] = highbd_idct_sub_dual(q[2], q[6]);
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+ store_in_output(out, 11, 12, q[1], q[3]);
+
+ // -----------------------------------------
+ // BLOCK D: 0-3,4-7
+ // -----------------------------------------
+ // generate 4,5,6,7
+ // part of stage 3
+ load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]);
+ load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
+ // part of stage 4
+ q[13] = highbd_idct_sub_dual(q[0], q[1]);
+ q[0] = highbd_idct_add_dual(q[0], q[1]);
+ q[14] = highbd_idct_sub_dual(q[2], q[3]);
+ q[2] = highbd_idct_add_dual(q[2], q[3]);
+ // part of stage 5
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+
+ // generate 0,1,2,3
+ // part of stage 4
+ load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]);
+ load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]);
+ do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]);
+ // part of stage 5
+ q[4] = highbd_idct_add_dual(q[7], q[6]);
+ q[7] = highbd_idct_sub_dual(q[7], q[6]);
+ q[6] = highbd_idct_sub_dual(q[5], q[14]);
+ q[5] = highbd_idct_add_dual(q[5], q[14]);
+ // part of stage 6
+ q[8] = highbd_idct_add_dual(q[4], q[2]);
+ q[9] = highbd_idct_add_dual(q[5], q[3]);
+ q[10] = highbd_idct_add_dual(q[6], q[1]);
+ q[11] = highbd_idct_add_dual(q[7], q[0]);
+ q[12] = highbd_idct_sub_dual(q[7], q[0]);
+ q[13] = highbd_idct_sub_dual(q[6], q[1]);
+ q[14] = highbd_idct_sub_dual(q[5], q[3]);
+ q[15] = highbd_idct_sub_dual(q[4], q[2]);
+ // part of stage 7
+ load_from_output(out, 14, 15, &q[0], &q[1]);
+ q[2] = highbd_idct_add_dual(q[8], q[1]);
+ q[3] = highbd_idct_add_dual(q[9], q[0]);
+ q[4] = highbd_idct_sub_dual(q[9], q[0]);
+ q[5] = highbd_idct_sub_dual(q[8], q[1]);
+ load_from_output(out, 16, 17, &q[0], &q[1]);
+ q[8] = highbd_idct_add_dual(q[4], q[1]);
+ q[9] = highbd_idct_add_dual(q[5], q[0]);
+ q[6] = highbd_idct_sub_dual(q[5], q[0]);
+ q[7] = highbd_idct_sub_dual(q[4], q[1]);
+
+ // First pass: finish the bands into the intermediate buffer. Second
+ // pass: clamp to [0, (1 << bd) - 1] and accumulate into dst.
+ if (idct32_pass_loop == 0) {
+ idct32_bands_end_1st_pass(out, q);
+ } else {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ idct32_bands_end_2nd_pass(out, dst, stride, max, q);
+ dst += 8;
+ }
+ }
+ }
+}
+
+// Public entry point: 32x32 inverse transform with 1024 non-zero
+// coefficients, added into a 16-bit destination buffer.
+void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ if (bd != 8) {
+ vpx_highbd_idct32_32_neon(input, dest, stride, bd);
+ return;
+ }
+ // 8-bit depth: reuse the standard-bitdepth path on the 16-bit buffer.
+ vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
new file mode 100644
index 0000000000..6750c1a426
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
@@ -0,0 +1,757 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+// Load eight 8-element rows of 32-bit coefficients from a buffer whose row
+// stride is 32, one output register pair per row.
+static INLINE void load_8x8_s32_dual(
+ const tran_low_t *input, int32x4x2_t *const in0, int32x4x2_t *const in1,
+ int32x4x2_t *const in2, int32x4x2_t *const in3, int32x4x2_t *const in4,
+ int32x4x2_t *const in5, int32x4x2_t *const in6, int32x4x2_t *const in7) {
+ int32x4x2_t *const rows[8] = { in0, in1, in2, in3, in4, in5, in6, in7 };
+ int r;
+
+ for (r = 0; r < 8; r++) {
+   rows[r]->val[0] = vld1q_s32(input);
+   rows[r]->val[1] = vld1q_s32(input + 4);
+   input += 32;
+ }
+}
+
+// Load eight 4-element rows of 32-bit coefficients from a buffer whose row
+// stride is 32, one output register per row.
+static INLINE void load_4x8_s32_dual(const tran_low_t *input,
+ int32x4_t *const in0, int32x4_t *const in1,
+ int32x4_t *const in2, int32x4_t *const in3,
+ int32x4_t *const in4, int32x4_t *const in5,
+ int32x4_t *const in6,
+ int32x4_t *const in7) {
+ int32x4_t *const rows[8] = { in0, in1, in2, in3, in4, in5, in6, in7 };
+ int r;
+
+ for (r = 0; r < 8; r++) {
+   *rows[r] = vld1q_s32(input);
+   input += 32;
+ }
+}
+
+// Only for the first pass of the _135_ variant. Since it only uses values from
+// the top left 16x16 it can safely assume all the remaining values are 0 and
+// skip an awful lot of calculations. In fact, only the first 12 columns make
+// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are
+// used so it skips any calls to input[12|13|14|15] too.
+// In C this does a single row of 32 for each call. Here it transposes the top
+// left 12x8 to allow using SIMD.
+
+// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero
+// coefficients as follows:
+// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// 0 0 2 5 10 17 25 38 47 62 83 101 121
+// 1 1 4 8 15 22 30 45 58 74 92 112 133
+// 2 3 7 12 18 28 36 52 64 82 102 118
+// 3 6 11 16 23 31 43 60 73 90 109 126
+// 4 9 14 19 29 37 50 65 78 98 116 134
+// 5 13 20 26 35 44 54 72 85 105 123
+// 6 21 27 33 42 53 63 80 94 113 132
+// 7 24 32 39 48 57 71 88 104 120
+// 8 34 40 46 56 68 81 96 111 130
+// 9 41 49 55 67 77 91 107 124
+// 10 51 59 66 76 89 99 119 131
+// 11 61 69 75 87 100 114 129
+// 12 70 79 86 97 108 122
+// 13 84 93 103 110 125
+// 14 98 106 115 127
+// 15 117 128
+// First (row) pass for the 135-coefficient case. Only the top-left 12
+// columns of the (transposed) input contribute, so butterflies whose inputs
+// would be zero are skipped entirely. Writes 32 rows of 8 int32 values to
+// `output`, with rows spaced 16 apart, ready for the 16-wide second pass.
+static void vpx_highbd_idct32_12_neon(const tran_low_t *const input,
+ int32_t *output) {
+ int32x4x2_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
+ s8[32];
+
+ // Load and transpose the top-left 8x8, then the 4-wide strip (cols 8-11).
+ load_8x8_s32_dual(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5],
+ &in[6], &in[7]);
+ transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+
+ load_4x8_s32_dual(input + 8, &in[8].val[0], &in[8].val[1], &in[9].val[0],
+ &in[9].val[1], &in[10].val[0], &in[10].val[1],
+ &in[11].val[0], &in[11].val[1]);
+ transpose_s32_4x8(&in[8].val[0], &in[8].val[1], &in[9].val[0], &in[9].val[1],
+ &in[10].val[0], &in[10].val[1], &in[11].val[0],
+ &in[11].val[1]);
+
+ // stage 1
+ s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
+ s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);
+
+ // Entries fed only by columns >= 12 (e.g. s1[17]/s1[30]) are omitted here.
+ s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64);
+ s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64);
+
+ s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64);
+ s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64);
+
+ s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
+ s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);
+
+ s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64);
+ s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64);
+
+ s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
+ s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);
+
+ // stage 2
+ s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
+ s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);
+
+ s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64);
+ s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64);
+
+ s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64);
+ s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64);
+
+ s2[18] = highbd_idct_sub_dual(s1[19], s1[18]);
+ s2[19] = highbd_idct_add_dual(s1[18], s1[19]);
+ s2[20] = highbd_idct_add_dual(s1[20], s1[21]);
+ s2[21] = highbd_idct_sub_dual(s1[20], s1[21]);
+ s2[26] = highbd_idct_sub_dual(s1[27], s1[26]);
+ s2[27] = highbd_idct_add_dual(s1[26], s1[27]);
+ s2[28] = highbd_idct_add_dual(s1[28], s1[29]);
+ s2[29] = highbd_idct_sub_dual(s1[28], s1[29]);
+
+ // stage 3
+ s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
+ s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);
+
+ s3[10] = highbd_idct_sub_dual(s2[11], s2[10]);
+ s3[11] = highbd_idct_add_dual(s2[10], s2[11]);
+ s3[12] = highbd_idct_add_dual(s2[12], s2[13]);
+ s3[13] = highbd_idct_sub_dual(s2[12], s2[13]);
+
+ s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64,
+ s1[31], cospi_28_64);
+ s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64,
+ s1[31], cospi_4_64);
+
+ s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64,
+ s2[29], -cospi_4_64);
+ s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64,
+ s2[29], cospi_28_64);
+
+ s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64,
+ s2[26], cospi_12_64);
+ s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64,
+ s2[26], cospi_20_64);
+
+ s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64,
+ s1[24], -cospi_20_64);
+ s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64,
+ s1[24], cospi_12_64);
+
+ // stage 4
+ s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
+ s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64);
+ s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64);
+
+ s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64,
+ s2[15], cospi_24_64);
+ s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64,
+ s2[15], cospi_8_64);
+
+ s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64,
+ s3[13], -cospi_8_64);
+ s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64,
+ s3[13], cospi_24_64);
+
+ s4[16] = highbd_idct_add_dual(s1[16], s2[19]);
+ s4[17] = highbd_idct_add_dual(s3[17], s3[18]);
+ s4[18] = highbd_idct_sub_dual(s3[17], s3[18]);
+ s4[19] = highbd_idct_sub_dual(s1[16], s2[19]);
+ s4[20] = highbd_idct_sub_dual(s1[23], s2[20]);
+ s4[21] = highbd_idct_sub_dual(s3[22], s3[21]);
+ s4[22] = highbd_idct_add_dual(s3[21], s3[22]);
+ s4[23] = highbd_idct_add_dual(s2[20], s1[23]);
+ s4[24] = highbd_idct_add_dual(s1[24], s2[27]);
+ s4[25] = highbd_idct_add_dual(s3[25], s3[26]);
+ s4[26] = highbd_idct_sub_dual(s3[25], s3[26]);
+ s4[27] = highbd_idct_sub_dual(s1[24], s2[27]);
+ s4[28] = highbd_idct_sub_dual(s1[31], s2[28]);
+ s4[29] = highbd_idct_sub_dual(s3[30], s3[29]);
+ s4[30] = highbd_idct_add_dual(s3[29], s3[30]);
+ s4[31] = highbd_idct_add_dual(s2[28], s1[31]);
+
+ // stage 5
+ s5[0] = highbd_idct_add_dual(s4[0], s4[3]);
+ s5[1] = highbd_idct_add_dual(s4[0], s4[2]);
+ s5[2] = highbd_idct_sub_dual(s4[0], s4[2]);
+ s5[3] = highbd_idct_sub_dual(s4[0], s4[3]);
+
+ s5[5] = sub_multiply_shift_and_narrow_s32_dual(s3[7], s3[4], cospi_16_64);
+ s5[6] = add_multiply_shift_and_narrow_s32_dual(s3[4], s3[7], cospi_16_64);
+
+ s5[8] = highbd_idct_add_dual(s2[8], s3[11]);
+ s5[9] = highbd_idct_add_dual(s4[9], s4[10]);
+ s5[10] = highbd_idct_sub_dual(s4[9], s4[10]);
+ s5[11] = highbd_idct_sub_dual(s2[8], s3[11]);
+ s5[12] = highbd_idct_sub_dual(s2[15], s3[12]);
+ s5[13] = highbd_idct_sub_dual(s4[14], s4[13]);
+ s5[14] = highbd_idct_add_dual(s4[13], s4[14]);
+ s5[15] = highbd_idct_add_dual(s2[15], s3[12]);
+
+ s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64,
+ s4[29], cospi_24_64);
+ s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64,
+ s4[29], cospi_8_64);
+
+ s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64,
+ s4[28], cospi_24_64);
+ s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64,
+ s4[28], cospi_8_64);
+
+ s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64,
+ s4[27], -cospi_8_64);
+ s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64,
+ s4[27], cospi_24_64);
+
+ s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64,
+ s4[26], -cospi_8_64);
+ s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64,
+ s4[26], cospi_24_64);
+
+ // stage 6
+ s6[0] = highbd_idct_add_dual(s5[0], s3[7]);
+ s6[1] = highbd_idct_add_dual(s5[1], s5[6]);
+ s6[2] = highbd_idct_add_dual(s5[2], s5[5]);
+ s6[3] = highbd_idct_add_dual(s5[3], s3[4]);
+ s6[4] = highbd_idct_sub_dual(s5[3], s3[4]);
+ s6[5] = highbd_idct_sub_dual(s5[2], s5[5]);
+ s6[6] = highbd_idct_sub_dual(s5[1], s5[6]);
+ s6[7] = highbd_idct_sub_dual(s5[0], s3[7]);
+
+ s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64);
+ s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64);
+
+ s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64);
+ s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64);
+
+ s6[16] = highbd_idct_add_dual(s4[16], s4[23]);
+ s6[17] = highbd_idct_add_dual(s4[17], s4[22]);
+ s6[18] = highbd_idct_add_dual(s5[18], s5[21]);
+ s6[19] = highbd_idct_add_dual(s5[19], s5[20]);
+ s6[20] = highbd_idct_sub_dual(s5[19], s5[20]);
+ s6[21] = highbd_idct_sub_dual(s5[18], s5[21]);
+ s6[22] = highbd_idct_sub_dual(s4[17], s4[22]);
+ s6[23] = highbd_idct_sub_dual(s4[16], s4[23]);
+
+ s6[24] = highbd_idct_sub_dual(s4[31], s4[24]);
+ s6[25] = highbd_idct_sub_dual(s4[30], s4[25]);
+ s6[26] = highbd_idct_sub_dual(s5[29], s5[26]);
+ s6[27] = highbd_idct_sub_dual(s5[28], s5[27]);
+ s6[28] = highbd_idct_add_dual(s5[27], s5[28]);
+ s6[29] = highbd_idct_add_dual(s5[26], s5[29]);
+ s6[30] = highbd_idct_add_dual(s4[25], s4[30]);
+ s6[31] = highbd_idct_add_dual(s4[24], s4[31]);
+
+ // stage 7
+ s7[0] = highbd_idct_add_dual(s6[0], s5[15]);
+ s7[1] = highbd_idct_add_dual(s6[1], s5[14]);
+ s7[2] = highbd_idct_add_dual(s6[2], s6[13]);
+ s7[3] = highbd_idct_add_dual(s6[3], s6[12]);
+ s7[4] = highbd_idct_add_dual(s6[4], s6[11]);
+ s7[5] = highbd_idct_add_dual(s6[5], s6[10]);
+ s7[6] = highbd_idct_add_dual(s6[6], s5[9]);
+ s7[7] = highbd_idct_add_dual(s6[7], s5[8]);
+ s7[8] = highbd_idct_sub_dual(s6[7], s5[8]);
+ s7[9] = highbd_idct_sub_dual(s6[6], s5[9]);
+ s7[10] = highbd_idct_sub_dual(s6[5], s6[10]);
+ s7[11] = highbd_idct_sub_dual(s6[4], s6[11]);
+ s7[12] = highbd_idct_sub_dual(s6[3], s6[12]);
+ s7[13] = highbd_idct_sub_dual(s6[2], s6[13]);
+ s7[14] = highbd_idct_sub_dual(s6[1], s5[14]);
+ s7[15] = highbd_idct_sub_dual(s6[0], s5[15]);
+
+ s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64);
+ s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64);
+
+ s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64);
+ s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64);
+
+ s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64);
+ s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64);
+
+ s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64);
+ s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64);
+
+ // final stage
+ s8[0] = highbd_idct_add_dual(s7[0], s6[31]);
+ s8[1] = highbd_idct_add_dual(s7[1], s6[30]);
+ s8[2] = highbd_idct_add_dual(s7[2], s6[29]);
+ s8[3] = highbd_idct_add_dual(s7[3], s6[28]);
+ s8[4] = highbd_idct_add_dual(s7[4], s7[27]);
+ s8[5] = highbd_idct_add_dual(s7[5], s7[26]);
+ s8[6] = highbd_idct_add_dual(s7[6], s7[25]);
+ s8[7] = highbd_idct_add_dual(s7[7], s7[24]);
+ s8[8] = highbd_idct_add_dual(s7[8], s7[23]);
+ s8[9] = highbd_idct_add_dual(s7[9], s7[22]);
+ s8[10] = highbd_idct_add_dual(s7[10], s7[21]);
+ s8[11] = highbd_idct_add_dual(s7[11], s7[20]);
+ s8[12] = highbd_idct_add_dual(s7[12], s6[19]);
+ s8[13] = highbd_idct_add_dual(s7[13], s6[18]);
+ s8[14] = highbd_idct_add_dual(s7[14], s6[17]);
+ s8[15] = highbd_idct_add_dual(s7[15], s6[16]);
+ s8[16] = highbd_idct_sub_dual(s7[15], s6[16]);
+ s8[17] = highbd_idct_sub_dual(s7[14], s6[17]);
+ s8[18] = highbd_idct_sub_dual(s7[13], s6[18]);
+ s8[19] = highbd_idct_sub_dual(s7[12], s6[19]);
+ s8[20] = highbd_idct_sub_dual(s7[11], s7[20]);
+ s8[21] = highbd_idct_sub_dual(s7[10], s7[21]);
+ s8[22] = highbd_idct_sub_dual(s7[9], s7[22]);
+ s8[23] = highbd_idct_sub_dual(s7[8], s7[23]);
+ s8[24] = highbd_idct_sub_dual(s7[7], s7[24]);
+ s8[25] = highbd_idct_sub_dual(s7[6], s7[25]);
+ s8[26] = highbd_idct_sub_dual(s7[5], s7[26]);
+ s8[27] = highbd_idct_sub_dual(s7[4], s7[27]);
+ s8[28] = highbd_idct_sub_dual(s7[3], s6[28]);
+ s8[29] = highbd_idct_sub_dual(s7[2], s6[29]);
+ s8[30] = highbd_idct_sub_dual(s7[1], s6[30]);
+ s8[31] = highbd_idct_sub_dual(s7[0], s6[31]);
+
+ // Store the 32 output rows (8 lanes each) with a row stride of 16.
+ vst1q_s32(output + 0, s8[0].val[0]);
+ vst1q_s32(output + 4, s8[0].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[1].val[0]);
+ vst1q_s32(output + 4, s8[1].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[2].val[0]);
+ vst1q_s32(output + 4, s8[2].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[3].val[0]);
+ vst1q_s32(output + 4, s8[3].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[4].val[0]);
+ vst1q_s32(output + 4, s8[4].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[5].val[0]);
+ vst1q_s32(output + 4, s8[5].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[6].val[0]);
+ vst1q_s32(output + 4, s8[6].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[7].val[0]);
+ vst1q_s32(output + 4, s8[7].val[1]);
+ output += 16;
+
+ vst1q_s32(output + 0, s8[8].val[0]);
+ vst1q_s32(output + 4, s8[8].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[9].val[0]);
+ vst1q_s32(output + 4, s8[9].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[10].val[0]);
+ vst1q_s32(output + 4, s8[10].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[11].val[0]);
+ vst1q_s32(output + 4, s8[11].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[12].val[0]);
+ vst1q_s32(output + 4, s8[12].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[13].val[0]);
+ vst1q_s32(output + 4, s8[13].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[14].val[0]);
+ vst1q_s32(output + 4, s8[14].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[15].val[0]);
+ vst1q_s32(output + 4, s8[15].val[1]);
+ output += 16;
+
+ vst1q_s32(output + 0, s8[16].val[0]);
+ vst1q_s32(output + 4, s8[16].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[17].val[0]);
+ vst1q_s32(output + 4, s8[17].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[18].val[0]);
+ vst1q_s32(output + 4, s8[18].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[19].val[0]);
+ vst1q_s32(output + 4, s8[19].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[20].val[0]);
+ vst1q_s32(output + 4, s8[20].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[21].val[0]);
+ vst1q_s32(output + 4, s8[21].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[22].val[0]);
+ vst1q_s32(output + 4, s8[22].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[23].val[0]);
+ vst1q_s32(output + 4, s8[23].val[1]);
+ output += 16;
+
+ vst1q_s32(output + 0, s8[24].val[0]);
+ vst1q_s32(output + 4, s8[24].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[25].val[0]);
+ vst1q_s32(output + 4, s8[25].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[26].val[0]);
+ vst1q_s32(output + 4, s8[26].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[27].val[0]);
+ vst1q_s32(output + 4, s8[27].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[28].val[0]);
+ vst1q_s32(output + 4, s8[28].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[29].val[0]);
+ vst1q_s32(output + 4, s8[29].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[30].val[0]);
+ vst1q_s32(output + 4, s8[30].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, s8[31].val[0]);
+ vst1q_s32(output + 4, s8[31].val[1]);
+}
+
+static void vpx_highbd_idct32_16_neon(const int32_t *const input,
+ uint16_t *const output, const int stride,
+ const int bd) {
+ int32x4x2_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
+ out[32];
+
+ load_and_transpose_s32_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4],
+ &in[5], &in[6], &in[7]);
+
+ load_and_transpose_s32_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11],
+ &in[12], &in[13], &in[14], &in[15]);
+
+ // stage 1
+ s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
+ s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);
+
+ s1[17] = multiply_shift_and_narrow_s32_dual(in[15], -cospi_17_64);
+ s1[30] = multiply_shift_and_narrow_s32_dual(in[15], cospi_15_64);
+
+ s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64);
+ s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64);
+
+ s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64);
+ s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64);
+
+ s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
+ s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);
+
+ s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64);
+ s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64);
+
+ s1[22] = multiply_shift_and_narrow_s32_dual(in[13], cospi_19_64);
+ s1[25] = multiply_shift_and_narrow_s32_dual(in[13], cospi_13_64);
+
+ s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
+ s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);
+
+ // stage 2
+ s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
+ s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);
+
+ s2[9] = multiply_shift_and_narrow_s32_dual(in[14], -cospi_18_64);
+ s2[14] = multiply_shift_and_narrow_s32_dual(in[14], cospi_14_64);
+
+ s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64);
+ s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64);
+
+ s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64);
+ s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64);
+
+ s2[16] = highbd_idct_add_dual(s1[16], s1[17]);
+ s2[17] = highbd_idct_sub_dual(s1[16], s1[17]);
+ s2[18] = highbd_idct_sub_dual(s1[19], s1[18]);
+ s2[19] = highbd_idct_add_dual(s1[18], s1[19]);
+ s2[20] = highbd_idct_add_dual(s1[20], s1[21]);
+ s2[21] = highbd_idct_sub_dual(s1[20], s1[21]);
+ s2[22] = highbd_idct_sub_dual(s1[23], s1[22]);
+ s2[23] = highbd_idct_add_dual(s1[22], s1[23]);
+ s2[24] = highbd_idct_add_dual(s1[24], s1[25]);
+ s2[25] = highbd_idct_sub_dual(s1[24], s1[25]);
+ s2[26] = highbd_idct_sub_dual(s1[27], s1[26]);
+ s2[27] = highbd_idct_add_dual(s1[26], s1[27]);
+ s2[28] = highbd_idct_add_dual(s1[28], s1[29]);
+ s2[29] = highbd_idct_sub_dual(s1[28], s1[29]);
+ s2[30] = highbd_idct_sub_dual(s1[31], s1[30]);
+ s2[31] = highbd_idct_add_dual(s1[30], s1[31]);
+
+ // stage 3
+ s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
+ s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);
+
+ s3[5] = multiply_shift_and_narrow_s32_dual(in[12], -cospi_20_64);
+ s3[6] = multiply_shift_and_narrow_s32_dual(in[12], cospi_12_64);
+
+ s3[8] = highbd_idct_add_dual(s2[8], s2[9]);
+ s3[9] = highbd_idct_sub_dual(s2[8], s2[9]);
+ s3[10] = highbd_idct_sub_dual(s2[11], s2[10]);
+ s3[11] = highbd_idct_add_dual(s2[10], s2[11]);
+ s3[12] = highbd_idct_add_dual(s2[12], s2[13]);
+ s3[13] = highbd_idct_sub_dual(s2[12], s2[13]);
+ s3[14] = highbd_idct_sub_dual(s2[15], s2[14]);
+ s3[15] = highbd_idct_add_dual(s2[14], s2[15]);
+
+ s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], -cospi_4_64,
+ s2[30], cospi_28_64);
+ s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], cospi_28_64,
+ s2[30], cospi_4_64);
+
+ s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64,
+ s2[29], -cospi_4_64);
+ s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64,
+ s2[29], cospi_28_64);
+
+ s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64,
+ s2[26], cospi_12_64);
+ s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64,
+ s2[26], cospi_20_64);
+
+ s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_12_64,
+ s2[25], -cospi_20_64);
+ s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_20_64,
+ s2[25], cospi_12_64);
+
+ // stage 4
+ s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
+ s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64);
+ s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64);
+
+ s4[4] = highbd_idct_add_dual(s3[4], s3[5]);
+ s4[5] = highbd_idct_sub_dual(s3[4], s3[5]);
+ s4[6] = highbd_idct_sub_dual(s3[7], s3[6]);
+ s4[7] = highbd_idct_add_dual(s3[6], s3[7]);
+
+ s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], -cospi_8_64,
+ s3[14], cospi_24_64);
+ s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], cospi_24_64,
+ s3[14], cospi_8_64);
+
+ s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64,
+ s3[13], -cospi_8_64);
+ s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64,
+ s3[13], cospi_24_64);
+
+ s4[16] = highbd_idct_add_dual(s2[16], s2[19]);
+ s4[17] = highbd_idct_add_dual(s3[17], s3[18]);
+ s4[18] = highbd_idct_sub_dual(s3[17], s3[18]);
+ s4[19] = highbd_idct_sub_dual(s2[16], s2[19]);
+ s4[20] = highbd_idct_sub_dual(s2[23], s2[20]);
+ s4[21] = highbd_idct_sub_dual(s3[22], s3[21]);
+ s4[22] = highbd_idct_add_dual(s3[21], s3[22]);
+ s4[23] = highbd_idct_add_dual(s2[20], s2[23]);
+ s4[24] = highbd_idct_add_dual(s2[24], s2[27]);
+ s4[25] = highbd_idct_add_dual(s3[25], s3[26]);
+ s4[26] = highbd_idct_sub_dual(s3[25], s3[26]);
+ s4[27] = highbd_idct_sub_dual(s2[24], s2[27]);
+ s4[28] = highbd_idct_sub_dual(s2[31], s2[28]);
+ s4[29] = highbd_idct_sub_dual(s3[30], s3[29]);
+ s4[30] = highbd_idct_add_dual(s3[29], s3[30]);
+ s4[31] = highbd_idct_add_dual(s2[28], s2[31]);
+
+ // stage 5
+ s5[0] = highbd_idct_add_dual(s4[0], s4[3]);
+ s5[1] = highbd_idct_add_dual(s4[0], s4[2]);
+ s5[2] = highbd_idct_sub_dual(s4[0], s4[2]);
+ s5[3] = highbd_idct_sub_dual(s4[0], s4[3]);
+
+ s5[5] = sub_multiply_shift_and_narrow_s32_dual(s4[6], s4[5], cospi_16_64);
+ s5[6] = add_multiply_shift_and_narrow_s32_dual(s4[5], s4[6], cospi_16_64);
+
+ s5[8] = highbd_idct_add_dual(s3[8], s3[11]);
+ s5[9] = highbd_idct_add_dual(s4[9], s4[10]);
+ s5[10] = highbd_idct_sub_dual(s4[9], s4[10]);
+ s5[11] = highbd_idct_sub_dual(s3[8], s3[11]);
+ s5[12] = highbd_idct_sub_dual(s3[15], s3[12]);
+ s5[13] = highbd_idct_sub_dual(s4[14], s4[13]);
+ s5[14] = highbd_idct_add_dual(s4[13], s4[14]);
+ s5[15] = highbd_idct_add_dual(s3[15], s3[12]);
+
+ s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64,
+ s4[29], cospi_24_64);
+ s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64,
+ s4[29], cospi_8_64);
+
+ s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64,
+ s4[28], cospi_24_64);
+ s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64,
+ s4[28], cospi_8_64);
+
+ s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64,
+ s4[27], -cospi_8_64);
+ s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64,
+ s4[27], cospi_24_64);
+
+ s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64,
+ s4[26], -cospi_8_64);
+ s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64,
+ s4[26], cospi_24_64);
+
+ // stage 6
+ s6[0] = highbd_idct_add_dual(s5[0], s4[7]);
+ s6[1] = highbd_idct_add_dual(s5[1], s5[6]);
+ s6[2] = highbd_idct_add_dual(s5[2], s5[5]);
+ s6[3] = highbd_idct_add_dual(s5[3], s4[4]);
+ s6[4] = highbd_idct_sub_dual(s5[3], s4[4]);
+ s6[5] = highbd_idct_sub_dual(s5[2], s5[5]);
+ s6[6] = highbd_idct_sub_dual(s5[1], s5[6]);
+ s6[7] = highbd_idct_sub_dual(s5[0], s4[7]);
+
+ s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64);
+ s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64);
+
+ s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64);
+ s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64);
+
+ s6[16] = highbd_idct_add_dual(s4[16], s4[23]);
+ s6[17] = highbd_idct_add_dual(s4[17], s4[22]);
+ s6[18] = highbd_idct_add_dual(s5[18], s5[21]);
+ s6[19] = highbd_idct_add_dual(s5[19], s5[20]);
+ s6[20] = highbd_idct_sub_dual(s5[19], s5[20]);
+ s6[21] = highbd_idct_sub_dual(s5[18], s5[21]);
+ s6[22] = highbd_idct_sub_dual(s4[17], s4[22]);
+ s6[23] = highbd_idct_sub_dual(s4[16], s4[23]);
+ s6[24] = highbd_idct_sub_dual(s4[31], s4[24]);
+ s6[25] = highbd_idct_sub_dual(s4[30], s4[25]);
+ s6[26] = highbd_idct_sub_dual(s5[29], s5[26]);
+ s6[27] = highbd_idct_sub_dual(s5[28], s5[27]);
+ s6[28] = highbd_idct_add_dual(s5[27], s5[28]);
+ s6[29] = highbd_idct_add_dual(s5[26], s5[29]);
+ s6[30] = highbd_idct_add_dual(s4[25], s4[30]);
+ s6[31] = highbd_idct_add_dual(s4[24], s4[31]);
+
+ // stage 7
+ s7[0] = highbd_idct_add_dual(s6[0], s5[15]);
+ s7[1] = highbd_idct_add_dual(s6[1], s5[14]);
+ s7[2] = highbd_idct_add_dual(s6[2], s6[13]);
+ s7[3] = highbd_idct_add_dual(s6[3], s6[12]);
+ s7[4] = highbd_idct_add_dual(s6[4], s6[11]);
+ s7[5] = highbd_idct_add_dual(s6[5], s6[10]);
+ s7[6] = highbd_idct_add_dual(s6[6], s5[9]);
+ s7[7] = highbd_idct_add_dual(s6[7], s5[8]);
+ s7[8] = highbd_idct_sub_dual(s6[7], s5[8]);
+ s7[9] = highbd_idct_sub_dual(s6[6], s5[9]);
+ s7[10] = highbd_idct_sub_dual(s6[5], s6[10]);
+ s7[11] = highbd_idct_sub_dual(s6[4], s6[11]);
+ s7[12] = highbd_idct_sub_dual(s6[3], s6[12]);
+ s7[13] = highbd_idct_sub_dual(s6[2], s6[13]);
+ s7[14] = highbd_idct_sub_dual(s6[1], s5[14]);
+ s7[15] = highbd_idct_sub_dual(s6[0], s5[15]);
+
+ s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64);
+ s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64);
+
+ s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64);
+ s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64);
+
+ s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64);
+ s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64);
+
+ s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64);
+ s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64);
+
+ // final stage
+ out[0] = highbd_idct_add_dual(s7[0], s6[31]);
+ out[1] = highbd_idct_add_dual(s7[1], s6[30]);
+ out[2] = highbd_idct_add_dual(s7[2], s6[29]);
+ out[3] = highbd_idct_add_dual(s7[3], s6[28]);
+ out[4] = highbd_idct_add_dual(s7[4], s7[27]);
+ out[5] = highbd_idct_add_dual(s7[5], s7[26]);
+ out[6] = highbd_idct_add_dual(s7[6], s7[25]);
+ out[7] = highbd_idct_add_dual(s7[7], s7[24]);
+ out[8] = highbd_idct_add_dual(s7[8], s7[23]);
+ out[9] = highbd_idct_add_dual(s7[9], s7[22]);
+ out[10] = highbd_idct_add_dual(s7[10], s7[21]);
+ out[11] = highbd_idct_add_dual(s7[11], s7[20]);
+ out[12] = highbd_idct_add_dual(s7[12], s6[19]);
+ out[13] = highbd_idct_add_dual(s7[13], s6[18]);
+ out[14] = highbd_idct_add_dual(s7[14], s6[17]);
+ out[15] = highbd_idct_add_dual(s7[15], s6[16]);
+ out[16] = highbd_idct_sub_dual(s7[15], s6[16]);
+ out[17] = highbd_idct_sub_dual(s7[14], s6[17]);
+ out[18] = highbd_idct_sub_dual(s7[13], s6[18]);
+ out[19] = highbd_idct_sub_dual(s7[12], s6[19]);
+ out[20] = highbd_idct_sub_dual(s7[11], s7[20]);
+ out[21] = highbd_idct_sub_dual(s7[10], s7[21]);
+ out[22] = highbd_idct_sub_dual(s7[9], s7[22]);
+ out[23] = highbd_idct_sub_dual(s7[8], s7[23]);
+ out[24] = highbd_idct_sub_dual(s7[7], s7[24]);
+ out[25] = highbd_idct_sub_dual(s7[6], s7[25]);
+ out[26] = highbd_idct_sub_dual(s7[5], s7[26]);
+ out[27] = highbd_idct_sub_dual(s7[4], s7[27]);
+ out[28] = highbd_idct_sub_dual(s7[3], s6[28]);
+ out[29] = highbd_idct_sub_dual(s7[2], s6[29]);
+ out[30] = highbd_idct_sub_dual(s7[1], s6[30]);
+ out[31] = highbd_idct_sub_dual(s7[0], s6[31]);
+
+ highbd_idct16x16_add_store(out, output, stride, bd);
+ highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
+}
+
+void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+
+ // For 8-bit content the 16-bit (non-highbd) kernels are exact and cheaper,
+ // so dispatch to them; otherwise run the 32-bit high-bitdepth kernels.
+ if (bd == 8) {
+ int16_t temp[32 * 16];
+ int16_t *t = temp;
+ // First pass: two calls cover the top-left 16x16 non-zero region (the
+ // second starts 8 rows into the input), each filling 8 columns of the
+ // 32x16 intermediate.
+ vpx_idct32_12_neon(input, temp);
+ vpx_idct32_12_neon(input + 32 * 8, temp + 8);
+
+ // Second pass: one 16-wide kernel per group of 8 destination columns.
+ for (i = 0; i < 32; i += 8) {
+ vpx_idct32_16_neon(t, dest, stride, 1);
+ t += (16 * 8);
+ dest += 8;
+ }
+ } else {
+ int32_t temp[32 * 16];
+ int32_t *t = temp;
+ // Same two-pass structure as above, but with 32-bit intermediates and the
+ // caller's actual bit depth for final clamping.
+ vpx_highbd_idct32_12_neon(input, temp);
+ vpx_highbd_idct32_12_neon(input + 32 * 8, temp + 8);
+
+ for (i = 0; i < 32; i += 8) {
+ vpx_highbd_idct32_16_neon(t, dest, stride, bd);
+ t += (16 * 8);
+ dest += 8;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
new file mode 100644
index 0000000000..f05932cec3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
@@ -0,0 +1,625 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+// Only for the first pass of the _34_ variant. Since it only uses values from
+// the top left 8x8 it can safely assume all the remaining values are 0 and skip
+// an awful lot of calculations. In fact, only the first 6 columns make the cut.
+// None of the elements in the 7th or 8th column are used so it skips any calls
+// to input[6] or input[7] too.
+// In C this does a single row of 32 for each call. Here it transposes the top
+// left 8x8 to allow using SIMD.
+
+// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero
+// coefficients as follows:
+// 0 1 2 3 4 5 6 7
+// 0 0 2 5 10 17 25
+// 1 1 4 8 15 22 30
+// 2 3 7 12 18 28
+// 3 6 11 16 23 31
+// 4 9 14 19 29
+// 5 13 20 26
+// 6 21 27 33
+// 7 24 32
+static void vpx_highbd_idct32_6_neon(const tran_low_t *input, int32_t *output) {
+ int32x4x2_t in[8], s1[32], s2[32], s3[32];
+
+ // Load the top-left 8x8 coefficient block (input row stride is 32) and
+ // transpose it so the row transform below processes 8 columns per vector op.
+ in[0].val[0] = vld1q_s32(input);
+ in[0].val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in[1].val[0] = vld1q_s32(input);
+ in[1].val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in[2].val[0] = vld1q_s32(input);
+ in[2].val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in[3].val[0] = vld1q_s32(input);
+ in[3].val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in[4].val[0] = vld1q_s32(input);
+ in[4].val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in[5].val[0] = vld1q_s32(input);
+ in[5].val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in[6].val[0] = vld1q_s32(input);
+ in[6].val[1] = vld1q_s32(input + 4);
+ input += 32;
+ in[7].val[0] = vld1q_s32(input);
+ in[7].val[1] = vld1q_s32(input + 4);
+ transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+
+ // stage 1
+ // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
+ s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
+ // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0)
+ s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);
+
+ s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
+ s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);
+
+ s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
+ s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);
+
+ // stage 2
+ s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
+ s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);
+
+ // stage 3
+ s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
+ s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);
+
+ s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64,
+ s1[31], cospi_28_64);
+ s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64,
+ s1[31], cospi_4_64);
+
+ s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64,
+ s1[27], cospi_12_64);
+ s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64,
+ s1[27], cospi_20_64);
+
+ s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64,
+ s1[24], -cospi_20_64);
+ s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64,
+ s1[24], cospi_12_64);
+
+ // stage 4
+ s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
+
+ s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64,
+ s2[15], cospi_24_64);
+ s2[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64,
+ s2[15], cospi_8_64);
+
+ s2[20] = highbd_idct_sub_dual(s1[23], s1[20]);
+ s2[21] = highbd_idct_sub_dual(s1[22], s1[21]);
+ s2[22] = highbd_idct_add_dual(s1[21], s1[22]);
+ s2[23] = highbd_idct_add_dual(s1[20], s1[23]);
+ s2[24] = highbd_idct_add_dual(s1[24], s1[27]);
+ s2[25] = highbd_idct_add_dual(s1[25], s1[26]);
+ s2[26] = highbd_idct_sub_dual(s1[25], s1[26]);
+ s2[27] = highbd_idct_sub_dual(s1[24], s1[27]);
+
+ // stage 5
+ s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64);
+ s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64);
+
+ s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], -cospi_8_64,
+ s1[30], cospi_24_64);
+ s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], cospi_24_64,
+ s1[30], cospi_8_64);
+
+ s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_8_64,
+ s1[31], cospi_24_64);
+ s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_24_64,
+ s1[31], cospi_8_64);
+
+ s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64,
+ s2[27], -cospi_8_64);
+ s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64,
+ s2[27], cospi_24_64);
+
+ s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64,
+ s2[26], -cospi_8_64);
+ s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64,
+ s2[26], cospi_24_64);
+
+ // stage 6
+ s2[0] = highbd_idct_add_dual(s1[0], s1[7]);
+ s2[1] = highbd_idct_add_dual(s1[0], s1[6]);
+ s2[2] = highbd_idct_add_dual(s1[0], s1[5]);
+ s2[3] = highbd_idct_add_dual(s1[0], s1[4]);
+ s2[4] = highbd_idct_sub_dual(s1[0], s1[4]);
+ s2[5] = highbd_idct_sub_dual(s1[0], s1[5]);
+ s2[6] = highbd_idct_sub_dual(s1[0], s1[6]);
+ s2[7] = highbd_idct_sub_dual(s1[0], s1[7]);
+
+ s2[10] = sub_multiply_shift_and_narrow_s32_dual(s2[14], s2[9], cospi_16_64);
+ s2[13] = add_multiply_shift_and_narrow_s32_dual(s2[9], s2[14], cospi_16_64);
+
+ s2[11] = sub_multiply_shift_and_narrow_s32_dual(s2[15], s2[8], cospi_16_64);
+ s2[12] = add_multiply_shift_and_narrow_s32_dual(s2[8], s2[15], cospi_16_64);
+
+ s2[16] = highbd_idct_add_dual(s1[16], s2[23]);
+ s2[17] = highbd_idct_add_dual(s1[17], s2[22]);
+ s2[18] = highbd_idct_add_dual(s1[18], s1[21]);
+ s2[19] = highbd_idct_add_dual(s1[19], s1[20]);
+ s2[20] = highbd_idct_sub_dual(s1[19], s1[20]);
+ s2[21] = highbd_idct_sub_dual(s1[18], s1[21]);
+ s2[22] = highbd_idct_sub_dual(s1[17], s2[22]);
+ s2[23] = highbd_idct_sub_dual(s1[16], s2[23]);
+
+ // s3[24..27] are kept separately because s2[24..27] are still live inputs
+ // for the s2[28..31] sums below.
+ s3[24] = highbd_idct_sub_dual(s1[31], s2[24]);
+ s3[25] = highbd_idct_sub_dual(s1[30], s2[25]);
+ s3[26] = highbd_idct_sub_dual(s1[29], s1[26]);
+ s3[27] = highbd_idct_sub_dual(s1[28], s1[27]);
+ s2[28] = highbd_idct_add_dual(s1[27], s1[28]);
+ s2[29] = highbd_idct_add_dual(s1[26], s1[29]);
+ s2[30] = highbd_idct_add_dual(s2[25], s1[30]);
+ s2[31] = highbd_idct_add_dual(s2[24], s1[31]);
+
+ // stage 7
+ s1[0] = highbd_idct_add_dual(s2[0], s2[15]);
+ s1[1] = highbd_idct_add_dual(s2[1], s2[14]);
+ s1[2] = highbd_idct_add_dual(s2[2], s2[13]);
+ s1[3] = highbd_idct_add_dual(s2[3], s2[12]);
+ s1[4] = highbd_idct_add_dual(s2[4], s2[11]);
+ s1[5] = highbd_idct_add_dual(s2[5], s2[10]);
+ s1[6] = highbd_idct_add_dual(s2[6], s2[9]);
+ s1[7] = highbd_idct_add_dual(s2[7], s2[8]);
+ s1[8] = highbd_idct_sub_dual(s2[7], s2[8]);
+ s1[9] = highbd_idct_sub_dual(s2[6], s2[9]);
+ s1[10] = highbd_idct_sub_dual(s2[5], s2[10]);
+ s1[11] = highbd_idct_sub_dual(s2[4], s2[11]);
+ s1[12] = highbd_idct_sub_dual(s2[3], s2[12]);
+ s1[13] = highbd_idct_sub_dual(s2[2], s2[13]);
+ s1[14] = highbd_idct_sub_dual(s2[1], s2[14]);
+ s1[15] = highbd_idct_sub_dual(s2[0], s2[15]);
+
+ s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64);
+ s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64);
+
+ s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64);
+ s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64);
+
+ s1[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s2[22], cospi_16_64);
+ s1[25] = add_multiply_shift_and_narrow_s32_dual(s2[22], s3[25], cospi_16_64);
+
+ s1[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s2[23], cospi_16_64);
+ s1[24] = add_multiply_shift_and_narrow_s32_dual(s2[23], s3[24], cospi_16_64);
+
+ // final stage
+ s3[0] = highbd_idct_add_dual(s1[0], s2[31]);
+ s3[1] = highbd_idct_add_dual(s1[1], s2[30]);
+ s3[2] = highbd_idct_add_dual(s1[2], s2[29]);
+ s3[3] = highbd_idct_add_dual(s1[3], s2[28]);
+ s3[4] = highbd_idct_add_dual(s1[4], s1[27]);
+ s3[5] = highbd_idct_add_dual(s1[5], s1[26]);
+ s3[6] = highbd_idct_add_dual(s1[6], s1[25]);
+ s3[7] = highbd_idct_add_dual(s1[7], s1[24]);
+ s3[8] = highbd_idct_add_dual(s1[8], s1[23]);
+ s3[9] = highbd_idct_add_dual(s1[9], s1[22]);
+ s3[10] = highbd_idct_add_dual(s1[10], s1[21]);
+ s3[11] = highbd_idct_add_dual(s1[11], s1[20]);
+ s3[12] = highbd_idct_add_dual(s1[12], s2[19]);
+ s3[13] = highbd_idct_add_dual(s1[13], s2[18]);
+ s3[14] = highbd_idct_add_dual(s1[14], s2[17]);
+ s3[15] = highbd_idct_add_dual(s1[15], s2[16]);
+ s3[16] = highbd_idct_sub_dual(s1[15], s2[16]);
+ s3[17] = highbd_idct_sub_dual(s1[14], s2[17]);
+ s3[18] = highbd_idct_sub_dual(s1[13], s2[18]);
+ s3[19] = highbd_idct_sub_dual(s1[12], s2[19]);
+ s3[20] = highbd_idct_sub_dual(s1[11], s1[20]);
+ s3[21] = highbd_idct_sub_dual(s1[10], s1[21]);
+ s3[22] = highbd_idct_sub_dual(s1[9], s1[22]);
+ s3[23] = highbd_idct_sub_dual(s1[8], s1[23]);
+ s3[24] = highbd_idct_sub_dual(s1[7], s1[24]);
+ s3[25] = highbd_idct_sub_dual(s1[6], s1[25]);
+ s3[26] = highbd_idct_sub_dual(s1[5], s1[26]);
+ s3[27] = highbd_idct_sub_dual(s1[4], s1[27]);
+ s3[28] = highbd_idct_sub_dual(s1[3], s2[28]);
+ s3[29] = highbd_idct_sub_dual(s1[2], s2[29]);
+ s3[30] = highbd_idct_sub_dual(s1[1], s2[30]);
+ s3[31] = highbd_idct_sub_dual(s1[0], s2[31]);
+
+ // Store all 32 result rows (8 values each) contiguously; the second pass
+ // (vpx_highbd_idct32_8_neon) reloads them with a row stride of 8.
+ vst1q_s32(output, s3[0].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[0].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[1].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[1].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[2].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[2].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[3].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[3].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[4].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[4].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[5].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[5].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[6].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[6].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[7].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[7].val[1]);
+ output += 4;
+
+ vst1q_s32(output, s3[8].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[8].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[9].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[9].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[10].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[10].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[11].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[11].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[12].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[12].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[13].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[13].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[14].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[14].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[15].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[15].val[1]);
+ output += 4;
+
+ vst1q_s32(output, s3[16].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[16].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[17].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[17].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[18].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[18].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[19].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[19].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[20].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[20].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[21].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[21].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[22].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[22].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[23].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[23].val[1]);
+ output += 4;
+
+ vst1q_s32(output, s3[24].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[24].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[25].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[25].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[26].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[26].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[27].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[27].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[28].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[28].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[29].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[29].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[30].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[30].val[1]);
+ output += 4;
+ vst1q_s32(output, s3[31].val[0]);
+ output += 4;
+ vst1q_s32(output, s3[31].val[1]);
+}
+
+// Second pass of the _34_ variant: consumes an 8-column slice of the
+// first-pass intermediate produced by vpx_highbd_idct32_6_neon and writes the
+// reconstructed pixels into the destination.
+static void vpx_highbd_idct32_8_neon(const int32_t *input, uint16_t *output,
+ int stride, const int bd) {
+ int32x4x2_t in[8], s1[32], s2[32], s3[32], out[32];
+
+ // The first pass stored rows contiguously, hence the row stride of 8 here.
+ load_and_transpose_s32_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4],
+ &in[5], &in[6], &in[7]);
+
+ // stage 1
+ s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
+ s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);
+
+ // Different for _8_
+ s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64);
+ s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64);
+
+ s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
+ s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);
+
+ s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
+ s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);
+
+ // stage 2
+ s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
+ s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);
+
+ s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64);
+ s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64);
+
+ // stage 3
+ s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
+ s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);
+
+ s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64,
+ s1[31], cospi_28_64);
+ s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64,
+ s1[31], cospi_4_64);
+
+ // Different for _8_
+ s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_28_64,
+ s1[28], -cospi_4_64);
+ s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_4_64,
+ s1[28], cospi_28_64);
+
+ s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64,
+ s1[27], cospi_12_64);
+ s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64,
+ s1[27], cospi_20_64);
+
+ s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64,
+ s1[24], -cospi_20_64);
+ s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64,
+ s1[24], cospi_12_64);
+
+ // stage 4
+ s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
+
+ s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64,
+ s2[15], cospi_24_64);
+ s2[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64,
+ s2[15], cospi_8_64);
+
+ s2[10] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_24_64,
+ s2[12], -cospi_8_64);
+ s2[13] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_8_64,
+ s2[12], cospi_24_64);
+
+ s2[16] = highbd_idct_add_dual(s1[16], s1[19]);
+
+ s2[17] = highbd_idct_add_dual(s1[17], s1[18]);
+ s2[18] = highbd_idct_sub_dual(s1[17], s1[18]);
+
+ s2[19] = highbd_idct_sub_dual(s1[16], s1[19]);
+
+ s2[20] = highbd_idct_sub_dual(s1[23], s1[20]);
+ s2[21] = highbd_idct_sub_dual(s1[22], s1[21]);
+
+ s2[22] = highbd_idct_add_dual(s1[21], s1[22]);
+ s2[23] = highbd_idct_add_dual(s1[20], s1[23]);
+
+ s2[24] = highbd_idct_add_dual(s1[24], s1[27]);
+ s2[25] = highbd_idct_add_dual(s1[25], s1[26]);
+ s2[26] = highbd_idct_sub_dual(s1[25], s1[26]);
+ s2[27] = highbd_idct_sub_dual(s1[24], s1[27]);
+
+ s2[28] = highbd_idct_sub_dual(s1[31], s1[28]);
+ s2[29] = highbd_idct_sub_dual(s1[30], s1[29]);
+ s2[30] = highbd_idct_add_dual(s1[29], s1[30]);
+ s2[31] = highbd_idct_add_dual(s1[28], s1[31]);
+
+ // stage 5
+ s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64);
+ s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64);
+
+ s1[8] = highbd_idct_add_dual(s2[8], s2[11]);
+ s1[9] = highbd_idct_add_dual(s2[9], s2[10]);
+ s1[10] = highbd_idct_sub_dual(s2[9], s2[10]);
+ s1[11] = highbd_idct_sub_dual(s2[8], s2[11]);
+ s1[12] = highbd_idct_sub_dual(s2[15], s2[12]);
+ s1[13] = highbd_idct_sub_dual(s2[14], s2[13]);
+ s1[14] = highbd_idct_add_dual(s2[13], s2[14]);
+ s1[15] = highbd_idct_add_dual(s2[12], s2[15]);
+
+ s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_8_64,
+ s2[29], cospi_24_64);
+ s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], cospi_24_64,
+ s2[29], cospi_8_64);
+
+ s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], -cospi_8_64,
+ s2[28], cospi_24_64);
+ s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], cospi_24_64,
+ s2[28], cospi_8_64);
+
+ s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64,
+ s2[27], -cospi_8_64);
+ s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64,
+ s2[27], cospi_24_64);
+
+ s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64,
+ s2[26], -cospi_8_64);
+ s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64,
+ s2[26], cospi_24_64);
+
+ // stage 6
+ s2[0] = highbd_idct_add_dual(s1[0], s1[7]);
+ s2[1] = highbd_idct_add_dual(s1[0], s1[6]);
+ s2[2] = highbd_idct_add_dual(s1[0], s1[5]);
+ s2[3] = highbd_idct_add_dual(s1[0], s1[4]);
+ s2[4] = highbd_idct_sub_dual(s1[0], s1[4]);
+ s2[5] = highbd_idct_sub_dual(s1[0], s1[5]);
+ s2[6] = highbd_idct_sub_dual(s1[0], s1[6]);
+ s2[7] = highbd_idct_sub_dual(s1[0], s1[7]);
+
+ s2[10] = sub_multiply_shift_and_narrow_s32_dual(s1[13], s1[10], cospi_16_64);
+ s2[13] = add_multiply_shift_and_narrow_s32_dual(s1[10], s1[13], cospi_16_64);
+
+ s2[11] = sub_multiply_shift_and_narrow_s32_dual(s1[12], s1[11], cospi_16_64);
+ s2[12] = add_multiply_shift_and_narrow_s32_dual(s1[11], s1[12], cospi_16_64);
+
+ s1[16] = highbd_idct_add_dual(s2[16], s2[23]);
+ s1[17] = highbd_idct_add_dual(s2[17], s2[22]);
+ s2[18] = highbd_idct_add_dual(s1[18], s1[21]);
+ s2[19] = highbd_idct_add_dual(s1[19], s1[20]);
+ s2[20] = highbd_idct_sub_dual(s1[19], s1[20]);
+ s2[21] = highbd_idct_sub_dual(s1[18], s1[21]);
+ s1[22] = highbd_idct_sub_dual(s2[17], s2[22]);
+ s1[23] = highbd_idct_sub_dual(s2[16], s2[23]);
+
+ // s3[24..27] are kept separately because s2[24..25] are still live inputs
+ // for the s2[30..31] sums below.
+ s3[24] = highbd_idct_sub_dual(s2[31], s2[24]);
+ s3[25] = highbd_idct_sub_dual(s2[30], s2[25]);
+ s3[26] = highbd_idct_sub_dual(s1[29], s1[26]);
+ s3[27] = highbd_idct_sub_dual(s1[28], s1[27]);
+ s2[28] = highbd_idct_add_dual(s1[27], s1[28]);
+ s2[29] = highbd_idct_add_dual(s1[26], s1[29]);
+ s2[30] = highbd_idct_add_dual(s2[25], s2[30]);
+ s2[31] = highbd_idct_add_dual(s2[24], s2[31]);
+
+ // stage 7
+ s1[0] = highbd_idct_add_dual(s2[0], s1[15]);
+ s1[1] = highbd_idct_add_dual(s2[1], s1[14]);
+ s1[2] = highbd_idct_add_dual(s2[2], s2[13]);
+ s1[3] = highbd_idct_add_dual(s2[3], s2[12]);
+ s1[4] = highbd_idct_add_dual(s2[4], s2[11]);
+ s1[5] = highbd_idct_add_dual(s2[5], s2[10]);
+ s1[6] = highbd_idct_add_dual(s2[6], s1[9]);
+ s1[7] = highbd_idct_add_dual(s2[7], s1[8]);
+ s1[8] = highbd_idct_sub_dual(s2[7], s1[8]);
+ s1[9] = highbd_idct_sub_dual(s2[6], s1[9]);
+ s1[10] = highbd_idct_sub_dual(s2[5], s2[10]);
+ s1[11] = highbd_idct_sub_dual(s2[4], s2[11]);
+ s1[12] = highbd_idct_sub_dual(s2[3], s2[12]);
+ s1[13] = highbd_idct_sub_dual(s2[2], s2[13]);
+ s1[14] = highbd_idct_sub_dual(s2[1], s1[14]);
+ s1[15] = highbd_idct_sub_dual(s2[0], s1[15]);
+
+ s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64);
+ s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64);
+
+ s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64);
+ s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64);
+
+ s2[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s1[22], cospi_16_64);
+ s1[25] = add_multiply_shift_and_narrow_s32_dual(s1[22], s3[25], cospi_16_64);
+
+ s2[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s1[23], cospi_16_64);
+ s1[24] = add_multiply_shift_and_narrow_s32_dual(s1[23], s3[24], cospi_16_64);
+
+ // final stage
+ out[0] = highbd_idct_add_dual(s1[0], s2[31]);
+ out[1] = highbd_idct_add_dual(s1[1], s2[30]);
+ out[2] = highbd_idct_add_dual(s1[2], s2[29]);
+ out[3] = highbd_idct_add_dual(s1[3], s2[28]);
+ out[4] = highbd_idct_add_dual(s1[4], s1[27]);
+ out[5] = highbd_idct_add_dual(s1[5], s1[26]);
+ out[6] = highbd_idct_add_dual(s1[6], s1[25]);
+ out[7] = highbd_idct_add_dual(s1[7], s1[24]);
+ out[8] = highbd_idct_add_dual(s1[8], s2[23]);
+ out[9] = highbd_idct_add_dual(s1[9], s2[22]);
+ out[10] = highbd_idct_add_dual(s1[10], s1[21]);
+ out[11] = highbd_idct_add_dual(s1[11], s1[20]);
+ out[12] = highbd_idct_add_dual(s1[12], s2[19]);
+ out[13] = highbd_idct_add_dual(s1[13], s2[18]);
+ out[14] = highbd_idct_add_dual(s1[14], s1[17]);
+ out[15] = highbd_idct_add_dual(s1[15], s1[16]);
+ out[16] = highbd_idct_sub_dual(s1[15], s1[16]);
+ out[17] = highbd_idct_sub_dual(s1[14], s1[17]);
+ out[18] = highbd_idct_sub_dual(s1[13], s2[18]);
+ out[19] = highbd_idct_sub_dual(s1[12], s2[19]);
+ out[20] = highbd_idct_sub_dual(s1[11], s1[20]);
+ out[21] = highbd_idct_sub_dual(s1[10], s1[21]);
+ out[22] = highbd_idct_sub_dual(s1[9], s2[22]);
+ out[23] = highbd_idct_sub_dual(s1[8], s2[23]);
+ out[24] = highbd_idct_sub_dual(s1[7], s1[24]);
+ out[25] = highbd_idct_sub_dual(s1[6], s1[25]);
+ out[26] = highbd_idct_sub_dual(s1[5], s1[26]);
+ out[27] = highbd_idct_sub_dual(s1[4], s1[27]);
+ out[28] = highbd_idct_sub_dual(s1[3], s2[28]);
+ out[29] = highbd_idct_sub_dual(s1[2], s2[29]);
+ out[30] = highbd_idct_sub_dual(s1[1], s2[30]);
+ out[31] = highbd_idct_sub_dual(s1[0], s2[31]);
+
+ // Add the result into the destination buffer, one 16-row half at a time.
+ highbd_idct16x16_add_store(out, output, stride, bd);
+ highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
+}
+
+void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ int i;
+
+ // For 8-bit content the 16-bit (non-highbd) kernels are exact and cheaper,
+ // so dispatch to them; otherwise run the 32-bit high-bitdepth kernels.
+ if (bd == 8) {
+ int16_t temp[32 * 8];
+ int16_t *t = temp;
+
+ // First pass: only the top-left 8x8 coefficients can be non-zero for the
+ // _34_ variant, so a single call covers them.
+ vpx_idct32_6_neon(input, t);
+
+ // Second pass: one 8-column kernel per group of 8 destination columns.
+ for (i = 0; i < 32; i += 8) {
+ vpx_idct32_8_neon(t, dest, stride, 1);
+ t += (8 * 8);
+ dest += 8;
+ }
+ } else {
+ int32_t temp[32 * 8];
+ int32_t *t = temp;
+
+ // Same two-pass structure as above, but with 32-bit intermediates and the
+ // caller's actual bit depth for final clamping.
+ vpx_highbd_idct32_6_neon(input, t);
+
+ for (i = 0; i < 32; i += 8) {
+ vpx_highbd_idct32_8_neon(t, dest, stride, bd);
+ t += (8 * 8);
+ dest += 8;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c
new file mode 100644
index 0000000000..c1354c0c1a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+// Add a non-negative DC residual `res` to one 32-pixel row at *dest and
+// clamp each result to `max` ((1 << bd) - 1).  Because res >= 0 and the
+// pixels are unsigned, no lower clamp is required.  Advances *dest by one
+// row.
+static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest,
+                                                     const int stride,
+                                                     const int16x8_t res,
+                                                     const int16x8_t max) {
+  const uint16x8_t a0 = vld1q_u16(*dest);
+  const uint16x8_t a1 = vld1q_u16(*dest + 8);
+  const uint16x8_t a2 = vld1q_u16(*dest + 16);
+  const uint16x8_t a3 = vld1q_u16(*dest + 24);
+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
+  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
+  const int16x8_t c0 = vminq_s16(b0, max);
+  const int16x8_t c1 = vminq_s16(b1, max);
+  const int16x8_t c2 = vminq_s16(b2, max);
+  const int16x8_t c3 = vminq_s16(b3, max);
+  vst1q_u16(*dest, vreinterpretq_u16_s16(c0));
+  vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
+  vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2));
+  vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3));
+  *dest += stride;
+}
+
+// Add a negative DC residual `res` to one 32-pixel row at *dest and clamp
+// the results at zero.  Because res < 0 the sum can never exceed the pixel
+// maximum, so no upper clamp is needed.  Advances *dest by one row.
+static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest,
+                                                     const int stride,
+                                                     const int16x8_t res) {
+  const uint16x8_t a0 = vld1q_u16(*dest);
+  const uint16x8_t a1 = vld1q_u16(*dest + 8);
+  const uint16x8_t a2 = vld1q_u16(*dest + 16);
+  const uint16x8_t a3 = vld1q_u16(*dest + 24);
+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
+  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
+  // Saturating unsigned left shift by 0 converts to u16 while clamping any
+  // negative sums to 0.
+  const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
+  const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
+  const uint16x8_t c2 = vqshluq_n_s16(b2, 0);
+  const uint16x8_t c3 = vqshluq_n_s16(b3, 0);
+  vst1q_u16(*dest, c0);
+  vst1q_u16(*dest + 8, c1);
+  vst1q_u16(*dest + 16, c2);
+  vst1q_u16(*dest + 24, c3);
+  *dest += stride;
+}
+
+// DC-only 32x32 inverse transform: scale input[0] through both transform
+// passes (two multiplies by cospi_16_64) and add the single resulting DC
+// value to all 32x32 pixels, clamped to the [0, (1 << bd) - 1] range.
+void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest,
+                                     int stride, int bd) {
+  const tran_low_t out0 = HIGHBD_WRAPLOW(
+      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+  const tran_low_t out1 = HIGHBD_WRAPLOW(
+      dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
+  // Final rounding shift of 6 for the 32x32 transform size.
+  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+  const int16x8_t dc = vdupq_n_s16(a1);
+  int i;
+
+  // Branch once on the sign of the DC so the per-row kernel needs only one
+  // clamp.  8 iterations x 4 unrolled calls = 32 rows.
+  if (a1 >= 0) {
+    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+    for (i = 0; i < 8; ++i) {
+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+    }
+  } else {
+    for (i = 0; i < 8; ++i) {
+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
new file mode 100644
index 0000000000..7be1dad1d3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+// res is in reverse row order
+static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest,
+ const int stride,
+ const int16x8_t res,
+ const int16x8_t max) {
+ const uint16x4_t a0 = vld1_u16(*dest);
+ const uint16x4_t a1 = vld1_u16(*dest + stride);
+ const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a1, a0));
+ // Note: In some profile tests, res is quite close to +/-32767.
+ // We use saturating addition.
+ const int16x8_t b = vqaddq_s16(res, a);
+ const int16x8_t c = vminq_s16(b, max);
+ const uint16x8_t d = vqshluq_n_s16(c, 0);
+ vst1_u16(*dest, vget_high_u16(d));
+ *dest += stride;
+ vst1_u16(*dest, vget_low_u16(d));
+ *dest += stride;
+}
+
+// DC-only 4x4 inverse transform: scale input[0] through both passes and add
+// the resulting DC value to all 16 pixels (two kernel calls, two rows each),
+// clamped to [0, (1 << bd) - 1].
+void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, int bd) {
+  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+  const tran_low_t out0 = HIGHBD_WRAPLOW(
+      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+  const tran_low_t out1 = HIGHBD_WRAPLOW(
+      dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
+  // Final rounding shift of 4 for the 4x4 transform size.
+  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
+  const int16x8_t dc = vdupq_n_s16(a1);
+
+  highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);
+  highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);
+}
+
+// Full (up to 16 coefficients) 4x4 inverse transform + reconstruction.
+// Selects a 16-bit path for bd == 8 and 32-bit (or 64-bit multiply) paths
+// for bd == 10 / 12 to preserve precision.
+void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
+                                    int stride, int bd) {
+  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+  int16x8_t a[2];
+  int32x4_t c[4];
+
+  c[0] = vld1q_s32(input);
+  c[1] = vld1q_s32(input + 4);
+  c[2] = vld1q_s32(input + 8);
+  c[3] = vld1q_s32(input + 12);
+
+  if (bd == 8) {
+    // Rows
+    a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1]));
+    a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3]));
+    transpose_idct4x4_16_bd8(a);
+
+    // Columns
+    a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+    transpose_idct4x4_16_bd8(a);
+    // Final rounding shift of 4 for the 4x4 transform size.
+    a[0] = vrshrq_n_s16(a[0], 4);
+    a[1] = vrshrq_n_s16(a[1], 4);
+  } else {
+    const int32x4_t cospis = vld1q_s32(kCospi32);
+
+    // Two kernel calls: first pass over rows, second over columns (the
+    // kernel transposes internally).
+    if (bd == 10) {
+      idct4x4_16_kernel_bd10(cospis, c);
+      idct4x4_16_kernel_bd10(cospis, c);
+    } else {
+      idct4x4_16_kernel_bd12(cospis, c);
+      idct4x4_16_kernel_bd12(cospis, c);
+    }
+    // a[1] combines c[3] before c[2] so the rows are in the reverse order
+    // expected by highbd_idct4x4_1_add_kernel2 below.
+    a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4));
+    a[1] = vcombine_s16(vqrshrn_n_s32(c[3], 4), vqrshrn_n_s32(c[2], 4));
+  }
+
+  highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max);
+  highbd_idct4x4_1_add_kernel2(&dest, stride, a[1], max);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
new file mode 100644
index 0000000000..bed3227ca7
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+// Add a non-negative DC residual `res` to one 8-pixel row at *dest, clamp to
+// `max` ((1 << bd) - 1), and advance *dest by one row.  No lower clamp is
+// needed because res >= 0.
+static INLINE void highbd_idct8x8_1_add_pos_kernel(uint16_t **dest,
+                                                   const int stride,
+                                                   const int16x8_t res,
+                                                   const int16x8_t max) {
+  const uint16x8_t a = vld1q_u16(*dest);
+  const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
+  const int16x8_t c = vminq_s16(b, max);
+  vst1q_u16(*dest, vreinterpretq_u16_s16(c));
+  *dest += stride;
+}
+
+// Add a negative DC residual `res` to one 8-pixel row at *dest, clamping the
+// results at 0 via the saturating unsigned shift, and advance *dest by one
+// row.  No upper clamp is needed because res < 0.
+static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest,
+                                                   const int stride,
+                                                   const int16x8_t res) {
+  const uint16x8_t a = vld1q_u16(*dest);
+  const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
+  const uint16x8_t c = vqshluq_n_s16(b, 0);
+  vst1q_u16(*dest, c);
+  *dest += stride;
+}
+
+// DC-only 8x8 inverse transform: scale input[0] through both passes and add
+// the resulting DC value to all 8 rows, clamped to [0, (1 << bd) - 1].
+void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, int bd) {
+  const tran_low_t out0 = HIGHBD_WRAPLOW(
+      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+  const tran_low_t out1 = HIGHBD_WRAPLOW(
+      dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
+  // Final rounding shift of 5 for the 8x8 transform size.
+  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
+  const int16x8_t dc = vdupq_n_s16(a1);
+
+  // Branch once on the sign of the DC so each row kernel needs only one
+  // clamp; fully unrolled over the 8 rows.
+  if (a1 >= 0) {
+    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+  } else {
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+  }
+}
+
+// One 1-D pass of the 8x8 inverse transform for the eob <= 12 case at
+// 10-bit depth (32-bit intermediates).  Only io0..io3 carry non-zero input;
+// the io4..io7 terms of the full butterfly are therefore omitted in stages
+// 1 and 2 (e.g. step2[0] == step2[1] because *io4 is zero).  Outputs the
+// transformed values back through io0..io7.
+static INLINE void idct8x8_12_half1d_bd10(
+    const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
+    int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
+    int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
+    int32x4_t *const io7) {
+  int32x4_t step1[8], step2[8];
+
+  transpose_s32_4x4(io0, io1, io2, io3);
+
+  // stage 1
+  step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1);
+  step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
+  step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
+  step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
+  step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+  step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
+
+  // stage 2
+  step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
+  step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
+  step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
+  step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+  step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+  step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
+
+  step2[4] = vaddq_s32(step1[4], step1[5]);
+  step2[5] = vsubq_s32(step1[4], step1[5]);
+  step2[6] = vsubq_s32(step1[7], step1[6]);
+  step2[7] = vaddq_s32(step1[7], step1[6]);
+
+  // stage 3 (step2[1] stands in for the omitted step2[0])
+  step1[0] = vaddq_s32(step2[1], step2[3]);
+  step1[1] = vaddq_s32(step2[1], step2[2]);
+  step1[2] = vsubq_s32(step2[1], step2[2]);
+  step1[3] = vsubq_s32(step2[1], step2[3]);
+
+  step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
+  step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
+  step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+
+  // stage 4
+  *io0 = vaddq_s32(step1[0], step2[7]);
+  *io1 = vaddq_s32(step1[1], step1[6]);
+  *io2 = vaddq_s32(step1[2], step1[5]);
+  *io3 = vaddq_s32(step1[3], step2[4]);
+  *io4 = vsubq_s32(step1[3], step2[4]);
+  *io5 = vsubq_s32(step1[2], step1[5]);
+  *io6 = vsubq_s32(step1[1], step1[6]);
+  *io7 = vsubq_s32(step1[0], step2[7]);
+}
+
+// One 1-D pass of the 8x8 inverse transform for the eob <= 12 case at
+// 12-bit depth.  Same butterfly as idct8x8_12_half1d_bd10, but multiplies
+// are widened to 64 bits (vmull/vrshrn) to keep full precision before the
+// DCT_CONST_BITS rounding.  Only io0..io3 carry non-zero input.
+static INLINE void idct8x8_12_half1d_bd12(
+    const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
+    int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
+    int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
+    int32x4_t *const io7) {
+  int32x2_t input1l, input1h, input3l, input3h;
+  int32x2_t step1l[2], step1h[2];
+  int32x4_t step1[8], step2[8];
+  int64x2_t t64[8];
+  int32x2_t t32[8];
+
+  transpose_s32_4x4(io0, io1, io2, io3);
+
+  // stage 1: split the inputs into 64-bit-multiply-sized halves.
+  input1l = vget_low_s32(*io1);
+  input1h = vget_high_s32(*io1);
+  input3l = vget_low_s32(*io3);
+  input3h = vget_high_s32(*io3);
+  step1l[0] = vget_low_s32(*io0);
+  step1h[0] = vget_high_s32(*io0);
+  step1l[1] = vget_low_s32(*io2);
+  step1h[1] = vget_high_s32(*io2);
+
+  t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1);
+  t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1);
+  t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0);
+  t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0);
+  t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1);
+  t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1);
+  t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0);
+  t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
+  step1[4] = vcombine_s32(t32[0], t32[1]);
+  step1[5] = vcombine_s32(t32[2], t32[3]);
+  step1[6] = vcombine_s32(t32[4], t32[5]);
+  step1[7] = vcombine_s32(t32[6], t32[7]);
+
+  // stage 2
+  t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0);
+  t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0);
+  t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1);
+  t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
+  t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
+  t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
+  step2[1] = vcombine_s32(t32[2], t32[3]);
+  step2[2] = vcombine_s32(t32[4], t32[5]);
+  step2[3] = vcombine_s32(t32[6], t32[7]);
+
+  step2[4] = vaddq_s32(step1[4], step1[5]);
+  step2[5] = vsubq_s32(step1[4], step1[5]);
+  step2[6] = vsubq_s32(step1[7], step1[6]);
+  step2[7] = vaddq_s32(step1[7], step1[6]);
+
+  // stage 3 (step2[1] stands in for the omitted step2[0])
+  step1[0] = vaddq_s32(step2[1], step2[3]);
+  step1[1] = vaddq_s32(step2[1], step2[2]);
+  step1[2] = vsubq_s32(step2[1], step2[2]);
+  step1[3] = vsubq_s32(step2[1], step2[3]);
+
+  t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0);
+  t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0);
+  t64[0] =
+      vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
+  t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]),
+                          vget_high_s32(cospis0), 0);
+  t64[2] =
+      vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
+  t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
+                          vget_high_s32(cospis0), 0);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  step1[5] = vcombine_s32(t32[0], t32[1]);
+  step1[6] = vcombine_s32(t32[2], t32[3]);
+
+  // stage 4
+  *io0 = vaddq_s32(step1[0], step2[7]);
+  *io1 = vaddq_s32(step1[1], step1[6]);
+  *io2 = vaddq_s32(step1[2], step1[5]);
+  *io3 = vaddq_s32(step1[3], step2[4]);
+  *io4 = vsubq_s32(step1[3], step2[4]);
+  *io5 = vsubq_s32(step1[2], step1[5]);
+  *io6 = vsubq_s32(step1[1], step1[6]);
+  *io7 = vsubq_s32(step1[0], step2[7]);
+}
+
+// 8x8 inverse transform + reconstruction for blocks with eob <= 12 (only the
+// top-left 4x4 coefficients non-zero).  One 4-wide row pass followed by two
+// 4-wide column passes, then a final rounding shift of 5 and a clamped add
+// into dest.
+void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest,
+                                    int stride, int bd) {
+  int32x4_t a[16];
+  int16x8_t c[8];
+
+  // Load only the top-left 4x4 coefficients (rows are 8 apart).
+  a[0] = vld1q_s32(input);
+  a[1] = vld1q_s32(input + 8);
+  a[2] = vld1q_s32(input + 16);
+  a[3] = vld1q_s32(input + 24);
+
+  if (bd == 8) {
+    const int16x8_t cospis = vld1q_s16(kCospi);
+    const int16x8_t cospisd = vaddq_s16(cospis, cospis);
+    const int16x4_t cospis0 = vget_low_s16(cospis);  // cospi 0, 8, 16, 24
+    const int16x4_t cospisd0 = vget_low_s16(cospisd);  // doubled 0, 8, 16, 24
+    const int16x4_t cospisd1 = vget_high_s16(cospisd);  // doubled 4, 12, 20, 28
+    int16x4_t b[8];
+
+    b[0] = vmovn_s32(a[0]);
+    b[1] = vmovn_s32(a[1]);
+    b[2] = vmovn_s32(a[2]);
+    b[3] = vmovn_s32(a[3]);
+
+    idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, b);
+    idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b, c);
+    // Final rounding shift of 5 for the 8x8 transform size.
+    c[0] = vrshrq_n_s16(c[0], 5);
+    c[1] = vrshrq_n_s16(c[1], 5);
+    c[2] = vrshrq_n_s16(c[2], 5);
+    c[3] = vrshrq_n_s16(c[3], 5);
+    c[4] = vrshrq_n_s16(c[4], 5);
+    c[5] = vrshrq_n_s16(c[5], 5);
+    c[6] = vrshrq_n_s16(c[6], 5);
+    c[7] = vrshrq_n_s16(c[7], 5);
+  } else {
+    const int32x4_t cospis0 = vld1q_s32(kCospi32);  // cospi 0, 8, 16, 24
+    const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28
+
+    // Row pass on a[0..3], then two column passes producing the left
+    // (a[8..11] with a[0..3]) and right (a[12..15] with a[4..7]) halves.
+    if (bd == 10) {
+      idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+                             &a[4], &a[5], &a[6], &a[7]);
+      idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+                             &a[8], &a[9], &a[10], &a[11]);
+      idct8x8_12_half1d_bd10(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7],
+                             &a[12], &a[13], &a[14], &a[15]);
+    } else {
+      idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+                             &a[4], &a[5], &a[6], &a[7]);
+      idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+                             &a[8], &a[9], &a[10], &a[11]);
+      idct8x8_12_half1d_bd12(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7],
+                             &a[12], &a[13], &a[14], &a[15]);
+    }
+    c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5));
+    c[1] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5));
+    c[2] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5));
+    c[3] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5));
+    c[4] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5));
+    c[5] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5));
+    c[6] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5));
+    c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5));
+  }
+  highbd_add8x8(c, dest, stride, bd);
+}
+
+// Full (up to 64 coefficients) 8x8 inverse transform + reconstruction.
+// bd == 8 uses 16-bit kernels on whole rows; bd == 10/12 process the block
+// in 4-wide halves with 32-bit (or 64-bit multiply) precision.
+void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest,
+                                    int stride, int bd) {
+  int32x4_t a[16];
+  int16x8_t c[8];
+
+  a[0] = vld1q_s32(input);
+  a[1] = vld1q_s32(input + 4);
+  a[2] = vld1q_s32(input + 8);
+  a[3] = vld1q_s32(input + 12);
+  a[4] = vld1q_s32(input + 16);
+  a[5] = vld1q_s32(input + 20);
+  a[6] = vld1q_s32(input + 24);
+  a[7] = vld1q_s32(input + 28);
+  a[8] = vld1q_s32(input + 32);
+  a[9] = vld1q_s32(input + 36);
+  a[10] = vld1q_s32(input + 40);
+  a[11] = vld1q_s32(input + 44);
+  a[12] = vld1q_s32(input + 48);
+  a[13] = vld1q_s32(input + 52);
+  a[14] = vld1q_s32(input + 56);
+  a[15] = vld1q_s32(input + 60);
+
+  if (bd == 8) {
+    const int16x8_t cospis = vld1q_s16(kCospi);
+    const int16x4_t cospis0 = vget_low_s16(cospis);  // cospi 0, 8, 16, 24
+    const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28
+    int16x8_t b[8];
+
+    // Narrow to 16 bits, one full row per vector.
+    b[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1]));
+    b[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3]));
+    b[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5]));
+    b[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7]));
+    b[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9]));
+    b[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11]));
+    b[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13]));
+    b[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15]));
+
+    // Row pass then column pass (the kernel transposes internally).
+    idct8x8_64_1d_bd8(cospis0, cospis1, b);
+    idct8x8_64_1d_bd8(cospis0, cospis1, b);
+
+    // Final rounding shift of 5 for the 8x8 transform size.
+    c[0] = vrshrq_n_s16(b[0], 5);
+    c[1] = vrshrq_n_s16(b[1], 5);
+    c[2] = vrshrq_n_s16(b[2], 5);
+    c[3] = vrshrq_n_s16(b[3], 5);
+    c[4] = vrshrq_n_s16(b[4], 5);
+    c[5] = vrshrq_n_s16(b[5], 5);
+    c[6] = vrshrq_n_s16(b[6], 5);
+    c[7] = vrshrq_n_s16(b[7], 5);
+  } else {
+    const int32x4_t cospis0 = vld1q_s32(kCospi32);  // cospi 0, 8, 16, 24
+    const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28
+
+    // Two row passes over the left/right 4-wide halves, then two column
+    // passes; the interleaved pointer order below feeds the columns.
+    if (bd == 10) {
+      idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+                             &a[4], &a[5], &a[6], &a[7]);
+      idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
+                             &a[12], &a[13], &a[14], &a[15]);
+      idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
+                             &a[2], &a[10], &a[3], &a[11]);
+      idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
+                             &a[6], &a[14], &a[7], &a[15]);
+    } else {
+      idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+                             &a[4], &a[5], &a[6], &a[7]);
+      idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
+                             &a[12], &a[13], &a[14], &a[15]);
+      idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
+                             &a[2], &a[10], &a[3], &a[11]);
+      idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
+                             &a[6], &a[14], &a[7], &a[15]);
+    }
+    c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5));
+    c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5));
+    c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5));
+    c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5));
+    c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5));
+    c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5));
+    c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5));
+    c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5));
+  }
+  highbd_add8x8(c, dest, stride, bd);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h
new file mode 100644
index 0000000000..518ef4336e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h
@@ -0,0 +1,474 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_
+#define VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+// Add two 4-pixel rows of residual (low half = first row, high half =
+// second row) to *dest with clamping to [0, max].  Advances *dest by two
+// rows.
+static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest,
+                                                const int stride,
+                                                const int16x8_t res,
+                                                const int16x8_t max) {
+  const uint16x4_t a0 = vld1_u16(*dest);
+  const uint16x4_t a1 = vld1_u16(*dest + stride);
+  const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1));
+  // Note: In some profile tests, res is quite close to +/-32767.
+  // We use saturating addition.
+  const int16x8_t b = vqaddq_s16(res, a);
+  const int16x8_t c = vminq_s16(b, max);
+  // Saturating unsigned shift by 0 clamps negatives to 0 on conversion.
+  const uint16x8_t d = vqshluq_n_s16(c, 0);
+  vst1_u16(*dest, vget_low_u16(d));
+  *dest += stride;
+  vst1_u16(*dest, vget_high_u16(d));
+  *dest += stride;
+}
+
+// One 1-D pass of the 4x4 inverse transform with 32-bit intermediates
+// (sufficient for 10-bit depth).  Transposes in place, applies the 4-point
+// butterfly (cospi 8/16/24 from `cospis`), rounds by DCT_CONST_BITS, and
+// writes the results back to a[0..3].
+static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis,
+                                          int32x4_t *const a) {
+  int32x4_t b0, b1, b2, b3;
+
+  transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]);
+  b0 = vaddq_s32(a[0], a[2]);
+  b1 = vsubq_s32(a[0], a[2]);
+  b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0);
+  b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0);
+  b2 = vmulq_lane_s32(a[1], vget_high_s32(cospis), 1);
+  b3 = vmulq_lane_s32(a[1], vget_low_s32(cospis), 1);
+  b2 = vmlsq_lane_s32(b2, a[3], vget_low_s32(cospis), 1);
+  b3 = vmlaq_lane_s32(b3, a[3], vget_high_s32(cospis), 1);
+  b0 = vrshrq_n_s32(b0, DCT_CONST_BITS);
+  b1 = vrshrq_n_s32(b1, DCT_CONST_BITS);
+  b2 = vrshrq_n_s32(b2, DCT_CONST_BITS);
+  b3 = vrshrq_n_s32(b3, DCT_CONST_BITS);
+  a[0] = vaddq_s32(b0, b3);
+  a[1] = vaddq_s32(b1, b2);
+  a[2] = vsubq_s32(b1, b2);
+  a[3] = vsubq_s32(b0, b3);
+}
+
+// One 1-D pass of the 4x4 inverse transform for 12-bit depth.  Same
+// butterfly as idct4x4_16_kernel_bd10, but the multiplies are widened to
+// 64 bits (vmull/vrshrn) to avoid overflow before the DCT_CONST_BITS
+// rounding.
+static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
+                                          int32x4_t *const a) {
+  int32x4_t b0, b1, b2, b3;
+  int64x2_t c[12];
+
+  transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]);
+  b0 = vaddq_s32(a[0], a[2]);
+  b1 = vsubq_s32(a[0], a[2]);
+  c[0] = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0);
+  c[1] = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0);
+  c[2] = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0);
+  c[3] = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0);
+  c[4] = vmull_lane_s32(vget_low_s32(a[1]), vget_high_s32(cospis), 1);
+  c[5] = vmull_lane_s32(vget_high_s32(a[1]), vget_high_s32(cospis), 1);
+  c[6] = vmull_lane_s32(vget_low_s32(a[1]), vget_low_s32(cospis), 1);
+  c[7] = vmull_lane_s32(vget_high_s32(a[1]), vget_low_s32(cospis), 1);
+  c[8] = vmull_lane_s32(vget_low_s32(a[3]), vget_low_s32(cospis), 1);
+  c[9] = vmull_lane_s32(vget_high_s32(a[3]), vget_low_s32(cospis), 1);
+  c[10] = vmull_lane_s32(vget_low_s32(a[3]), vget_high_s32(cospis), 1);
+  c[11] = vmull_lane_s32(vget_high_s32(a[3]), vget_high_s32(cospis), 1);
+  c[4] = vsubq_s64(c[4], c[8]);
+  c[5] = vsubq_s64(c[5], c[9]);
+  c[6] = vaddq_s64(c[6], c[10]);
+  c[7] = vaddq_s64(c[7], c[11]);
+  b0 = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS),
+                    vrshrn_n_s64(c[1], DCT_CONST_BITS));
+  b1 = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS),
+                    vrshrn_n_s64(c[3], DCT_CONST_BITS));
+  b2 = vcombine_s32(vrshrn_n_s64(c[4], DCT_CONST_BITS),
+                    vrshrn_n_s64(c[5], DCT_CONST_BITS));
+  b3 = vcombine_s32(vrshrn_n_s64(c[6], DCT_CONST_BITS),
+                    vrshrn_n_s64(c[7], DCT_CONST_BITS));
+  a[0] = vaddq_s32(b0, b3);
+  a[1] = vaddq_s32(b1, b2);
+  a[2] = vsubq_s32(b1, b2);
+  a[3] = vsubq_s32(b0, b3);
+}
+
+// Add an 8x8 block of 16-bit residuals a[0..7] (one row per vector) to the
+// pixels at dest, clamping each result to [0, (1 << bd) - 1], and store the
+// reconstruction back to dest.
+static INLINE void highbd_add8x8(int16x8_t *const a, uint16_t *dest,
+                                 const int stride, const int bd) {
+  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+  const uint16_t *dst = dest;
+  uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7;
+  uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16;
+  int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16;
+
+  // Load all eight destination rows first.
+  d0 = vld1q_u16(dst);
+  dst += stride;
+  d1 = vld1q_u16(dst);
+  dst += stride;
+  d2 = vld1q_u16(dst);
+  dst += stride;
+  d3 = vld1q_u16(dst);
+  dst += stride;
+  d4 = vld1q_u16(dst);
+  dst += stride;
+  d5 = vld1q_u16(dst);
+  dst += stride;
+  d6 = vld1q_u16(dst);
+  dst += stride;
+  d7 = vld1q_u16(dst);
+
+  // Saturating add guards against overflow for residuals near +/-32767.
+  d0_s16 = vqaddq_s16(a[0], vreinterpretq_s16_u16(d0));
+  d1_s16 = vqaddq_s16(a[1], vreinterpretq_s16_u16(d1));
+  d2_s16 = vqaddq_s16(a[2], vreinterpretq_s16_u16(d2));
+  d3_s16 = vqaddq_s16(a[3], vreinterpretq_s16_u16(d3));
+  d4_s16 = vqaddq_s16(a[4], vreinterpretq_s16_u16(d4));
+  d5_s16 = vqaddq_s16(a[5], vreinterpretq_s16_u16(d5));
+  d6_s16 = vqaddq_s16(a[6], vreinterpretq_s16_u16(d6));
+  d7_s16 = vqaddq_s16(a[7], vreinterpretq_s16_u16(d7));
+
+  // Upper clamp to the bit-depth maximum.
+  d0_s16 = vminq_s16(d0_s16, max);
+  d1_s16 = vminq_s16(d1_s16, max);
+  d2_s16 = vminq_s16(d2_s16, max);
+  d3_s16 = vminq_s16(d3_s16, max);
+  d4_s16 = vminq_s16(d4_s16, max);
+  d5_s16 = vminq_s16(d5_s16, max);
+  d6_s16 = vminq_s16(d6_s16, max);
+  d7_s16 = vminq_s16(d7_s16, max);
+  // Saturating unsigned shift by 0 clamps negatives to 0 on conversion.
+  d0_u16 = vqshluq_n_s16(d0_s16, 0);
+  d1_u16 = vqshluq_n_s16(d1_s16, 0);
+  d2_u16 = vqshluq_n_s16(d2_s16, 0);
+  d3_u16 = vqshluq_n_s16(d3_s16, 0);
+  d4_u16 = vqshluq_n_s16(d4_s16, 0);
+  d5_u16 = vqshluq_n_s16(d5_s16, 0);
+  d6_u16 = vqshluq_n_s16(d6_s16, 0);
+  d7_u16 = vqshluq_n_s16(d7_s16, 0);
+
+  vst1q_u16(dest, d0_u16);
+  dest += stride;
+  vst1q_u16(dest, d1_u16);
+  dest += stride;
+  vst1q_u16(dest, d2_u16);
+  dest += stride;
+  vst1q_u16(dest, d3_u16);
+  dest += stride;
+  vst1q_u16(dest, d4_u16);
+  dest += stride;
+  vst1q_u16(dest, d5_u16);
+  dest += stride;
+  vst1q_u16(dest, d6_u16);
+  dest += stride;
+  vst1q_u16(dest, d7_u16);
+}
+
+// One 1-D pass of the full 8x8 inverse transform over a 4-wide half-block
+// at 10-bit depth (32-bit intermediates).  All eight inputs may be
+// non-zero, so every butterfly term is computed.  Outputs the transformed
+// values back through io0..io7.
+static INLINE void idct8x8_64_half1d_bd10(
+    const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
+    int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
+    int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
+    int32x4_t *const io7) {
+  int32x4_t step1[8], step2[8];
+
+  transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7);
+
+  // stage 1
+  step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1);
+  step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
+  step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
+  step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
+
+  step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0);
+  step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1);
+  step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0);
+  step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1);
+
+  step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+  step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
+
+  // stage 2
+  step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
+  step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
+  step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
+
+  // step2[0] must be formed before step2[1] is overwritten.
+  step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0);
+  step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0);
+  step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1);
+  step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1);
+
+  step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS);
+  step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+  step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+  step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
+
+  step2[4] = vaddq_s32(step1[4], step1[5]);
+  step2[5] = vsubq_s32(step1[4], step1[5]);
+  step2[6] = vsubq_s32(step1[7], step1[6]);
+  step2[7] = vaddq_s32(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = vaddq_s32(step2[0], step2[3]);
+  step1[1] = vaddq_s32(step2[1], step2[2]);
+  step1[2] = vsubq_s32(step2[1], step2[2]);
+  step1[3] = vsubq_s32(step2[0], step2[3]);
+
+  step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
+  step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
+  step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+
+  // stage 4
+  *io0 = vaddq_s32(step1[0], step2[7]);
+  *io1 = vaddq_s32(step1[1], step1[6]);
+  *io2 = vaddq_s32(step1[2], step1[5]);
+  *io3 = vaddq_s32(step1[3], step2[4]);
+  *io4 = vsubq_s32(step1[3], step2[4]);
+  *io5 = vsubq_s32(step1[2], step1[5]);
+  *io6 = vsubq_s32(step1[1], step1[6]);
+  *io7 = vsubq_s32(step1[0], step2[7]);
+}
+
+// One pass (half) of the 8x8 inverse DCT over four columns of 32-bit
+// coefficients, updating io0..io7 in place.  Unlike the bd10 variant above,
+// every butterfly product is kept in 64 bits (vmull/vmlal/vmlsl followed by
+// a rounding narrow) so the arithmetic stays exact for 12-bit input, where
+// 32-bit intermediates could overflow.
+// NOTE(review): cospis0/cospis1 are assumed to hold the cospi constants in
+// the lane order implied by the vget_low/high usage below — confirm against
+// the caller that loads them.
+static INLINE void idct8x8_64_half1d_bd12(
+    const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
+    int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
+    int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
+    int32x4_t *const io7) {
+  int32x2_t input1l, input1h, input3l, input3h, input5l, input5h, input7l,
+      input7h;
+  int32x2_t step1l[4], step1h[4];
+  int32x4_t step1[8], step2[8];
+  int64x2_t t64[8];
+  int32x2_t t32[8];
+
+  // Transpose the 8x4 tile so each vector holds one row of this half.
+  transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7);
+
+  // stage 1: rotate the odd inputs (1, 3, 5, 7) by the cospis1 constants,
+  // splitting each vector into low/high halves for the 64-bit multiplies.
+  input1l = vget_low_s32(*io1);
+  input1h = vget_high_s32(*io1);
+  input3l = vget_low_s32(*io3);
+  input3h = vget_high_s32(*io3);
+  input5l = vget_low_s32(*io5);
+  input5h = vget_high_s32(*io5);
+  input7l = vget_low_s32(*io7);
+  input7h = vget_high_s32(*io7);
+  step1l[0] = vget_low_s32(*io0);
+  step1h[0] = vget_high_s32(*io0);
+  step1l[1] = vget_low_s32(*io2);
+  step1h[1] = vget_high_s32(*io2);
+  step1l[2] = vget_low_s32(*io4);
+  step1h[2] = vget_high_s32(*io4);
+  step1l[3] = vget_low_s32(*io6);
+  step1h[3] = vget_high_s32(*io6);
+
+  t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1);
+  t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1);
+  t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0);
+  t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0);
+  t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1);
+  t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1);
+  t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0);
+  t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0);
+  t64[0] = vmlsl_lane_s32(t64[0], input7l, vget_low_s32(cospis1), 0);
+  t64[1] = vmlsl_lane_s32(t64[1], input7h, vget_low_s32(cospis1), 0);
+  t64[2] = vmlal_lane_s32(t64[2], input5l, vget_low_s32(cospis1), 1);
+  t64[3] = vmlal_lane_s32(t64[3], input5h, vget_low_s32(cospis1), 1);
+  t64[4] = vmlsl_lane_s32(t64[4], input5l, vget_high_s32(cospis1), 0);
+  t64[5] = vmlsl_lane_s32(t64[5], input5h, vget_high_s32(cospis1), 0);
+  t64[6] = vmlal_lane_s32(t64[6], input7l, vget_high_s32(cospis1), 1);
+  t64[7] = vmlal_lane_s32(t64[7], input7h, vget_high_s32(cospis1), 1);
+  // Round and narrow the 64-bit products back to 32 bits.
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
+  step1[4] = vcombine_s32(t32[0], t32[1]);
+  step1[5] = vcombine_s32(t32[2], t32[3]);
+  step1[6] = vcombine_s32(t32[4], t32[5]);
+  step1[7] = vcombine_s32(t32[6], t32[7]);
+
+  // stage 2: rotate the even inputs (0, 2, 4, 6) by cospis0, and combine
+  // the stage-1 odd results with add/sub butterflies.  t64[2]/t64[3] are
+  // deliberately read before being overwritten.
+  t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0);
+  t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0);
+  t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1);
+  t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
+  t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
+  t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
+  t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0);
+  t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0);
+  t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0);
+  t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0);
+  t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1);
+  t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1);
+  t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1);
+  t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
+  step2[0] = vcombine_s32(t32[0], t32[1]);
+  step2[1] = vcombine_s32(t32[2], t32[3]);
+  step2[2] = vcombine_s32(t32[4], t32[5]);
+  step2[3] = vcombine_s32(t32[6], t32[7]);
+
+  step2[4] = vaddq_s32(step1[4], step1[5]);
+  step2[5] = vsubq_s32(step1[4], step1[5]);
+  step2[6] = vsubq_s32(step1[7], step1[6]);
+  step2[7] = vaddq_s32(step1[7], step1[6]);
+
+  // stage 3: even butterflies, plus a rotation of step2[5]/step2[6].
+  step1[0] = vaddq_s32(step2[0], step2[3]);
+  step1[1] = vaddq_s32(step2[1], step2[2]);
+  step1[2] = vsubq_s32(step2[1], step2[2]);
+  step1[3] = vsubq_s32(step2[0], step2[3]);
+
+  t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0);
+  t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0);
+  t64[0] =
+      vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
+  t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]),
+                          vget_high_s32(cospis0), 0);
+  t64[2] =
+      vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
+  t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
+                          vget_high_s32(cospis0), 0);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  step1[5] = vcombine_s32(t32[0], t32[1]);
+  step1[6] = vcombine_s32(t32[2], t32[3]);
+
+  // stage 4: final add/sub butterflies produce the eight outputs in place.
+  *io0 = vaddq_s32(step1[0], step2[7]);
+  *io1 = vaddq_s32(step1[1], step1[6]);
+  *io2 = vaddq_s32(step1[2], step1[5]);
+  *io3 = vaddq_s32(step1[3], step2[4]);
+  *io4 = vsubq_s32(step1[3], step2[4]);
+  *io5 = vsubq_s32(step1[2], step1[5]);
+  *io6 = vsubq_s32(step1[1], step1[6]);
+  *io7 = vsubq_s32(step1[0], step2[7]);
+}
+
+// Write the 16 rows of first-pass IDCT results to the intermediate buffer.
+// The buffer is 16 columns wide; each row is one int32x4x2_t, stored as two
+// four-lane halves.
+static INLINE void highbd_idct16x16_store_pass1(const int32x4x2_t *const out,
+                                                int32_t *output) {
+  int i;
+  for (i = 0; i < 16; ++i) {
+    vst1q_s32(output + 0, out[i].val[0]);
+    vst1q_s32(output + 4, out[i].val[1]);
+    output += 16;
+  }
+}
+
+// Round the 16 rows of second-pass IDCT results down to 16 bits
+// (rounding shift by 6) and accumulate them into dest, clamping each
+// pixel to the [0, (1 << bd) - 1] range inside the add helper.
+static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out,
+                                              uint16_t *dest, const int stride,
+                                              const int bd) {
+  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+  int i;
+  for (i = 0; i < 16; ++i) {
+    const int16x8_t row = vcombine_s16(vrshrn_n_s32(out[i].val[0], 6),
+                                       vrshrn_n_s32(out[i].val[1], 6));
+    highbd_idct16x16_add8x1(row, max, &dest, stride);
+  }
+}
+
+void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output,
+ uint16_t *dest, const int stride,
+ const int bd);
+
+#endif // VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c
new file mode 100644
index 0000000000..235cb5b996
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c
@@ -0,0 +1,2514 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "sum_neon.h"
+#include "vpx/vpx_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+// Sum of four consecutive reference pixels.
+static INLINE uint16_t dc_sum_4(const uint16_t *ref) {
+  return horizontal_add_uint16x4(vld1_u16(ref));
+}
+
+// Fill a 4x4 block with the broadcast DC value.
+static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
+                                const uint16x4_t dc) {
+  int rows = 4;
+  do {
+    vst1_u16(dst, dc);
+    dst += stride;
+  } while (--rows != 0);
+}
+
+// DC prediction for 4x4: rounded mean of the 4 above + 4 left pixels.
+void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  const uint16x4_t above_u16 = vld1_u16(above);
+  const uint16x4_t left_u16 = vld1_u16(left);
+  // (sum + 4) >> 3 rounds the 8-pixel total to the nearest integer mean.
+  const uint16_t total = horizontal_add_uint16x4(vadd_u16(above_u16, left_u16));
+  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(total), 3);
+  (void)bd;
+  dc_store_4x4(dst, stride, dc);
+}
+
+// DC prediction using only the left edge: (sum of 4 left pixels + 2) >> 2.
+void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(dc_sum_4(left)), 2);
+  (void)above;
+  (void)bd;
+  dc_store_4x4(dst, stride, dc);
+}
+
+// DC prediction using only the above edge: (sum of 4 above pixels + 2) >> 2.
+void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(dc_sum_4(above)), 2);
+  (void)left;
+  (void)bd;
+  dc_store_4x4(dst, stride, dc);
+}
+
+// Neither edge available: predict the mid-range value for this bitdepth.
+void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const uint16x4_t dc = vdup_n_u16((uint16_t)(1 << (bd - 1)));
+  (void)left;
+  (void)above;
+  dc_store_4x4(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+// Sum of eight consecutive reference pixels.
+static INLINE uint16_t dc_sum_8(const uint16_t *ref) {
+  return horizontal_add_uint16x8(vld1q_u16(ref));
+}
+
+// Fill an 8x8 block with the broadcast DC value.
+static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
+                                const uint16x8_t dc) {
+  int rows = 8;
+  do {
+    vst1q_u16(dst, dc);
+    dst += stride;
+  } while (--rows != 0);
+}
+
+// DC prediction for 8x8: rounded mean of the 8 above + 8 left pixels.
+void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  const uint16x8_t a = vld1q_u16(above);
+  const uint16x8_t l = vld1q_u16(left);
+  // (sum + 8) >> 4 rounds the 16-pixel total to the nearest integer mean.
+  const uint16_t total = horizontal_add_uint16x8(vaddq_u16(a, l));
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(total), 4);
+  (void)bd;
+  dc_store_8x8(dst, stride, dc);
+}
+
+// DC prediction using only the left edge: (sum of 8 left pixels + 4) >> 3.
+void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(dc_sum_8(left)), 3);
+  (void)above;
+  (void)bd;
+  dc_store_8x8(dst, stride, dc);
+}
+
+// DC prediction using only the above edge: (sum of 8 above pixels + 4) >> 3.
+void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(dc_sum_8(above)), 3);
+  (void)left;
+  (void)bd;
+  dc_store_8x8(dst, stride, dc);
+}
+
+// Neither edge available: predict the mid-range value for this bitdepth.
+void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const uint16x8_t dc = vdupq_n_u16((uint16_t)(1 << (bd - 1)));
+  (void)left;
+  (void)above;
+  dc_store_8x8(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+// Sum of sixteen consecutive reference pixels: add the two 8-lane halves
+// lane-wise, then reduce across lanes.
+static INLINE uint16_t dc_sum_16(const uint16_t *ref) {
+  const uint16x8_t partial =
+      vaddq_u16(vld1q_u16(ref + 0), vld1q_u16(ref + 8));
+  return horizontal_add_uint16x8(partial);
+}
+
+// Fill a 16x16 block with the broadcast DC value (two 8-lane stores per row).
+static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
+                                  const uint16x8_t dc) {
+  int rows = 16;
+  do {
+    vst1q_u16(dst + 0, dc);
+    vst1q_u16(dst + 8, dc);
+    dst += stride;
+  } while (--rows != 0);
+}
+
+// DC prediction for 16x16: rounded mean of the 16 above + 16 left pixels.
+// The 32-pixel total can exceed 16 bits at high bitdepth, so the rounding
+// shift ((sum + 16) >> 5) is done in 32 bits and narrowed before broadcast.
+void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const uint16x8_t a_lo = vld1q_u16(above + 0);
+  const uint16x8_t a_hi = vld1q_u16(above + 8);
+  const uint16x8_t l_lo = vld1q_u16(left + 0);
+  const uint16x8_t l_hi = vld1q_u16(left + 8);
+  const uint16x8_t edge_sum =
+      vaddq_u16(vaddq_u16(a_lo, a_hi), vaddq_u16(l_lo, l_hi));
+  const uint32_t sum = horizontal_add_uint16x8(edge_sum);
+  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
+  (void)bd;
+  dc_store_16x16(dst, stride, dc);
+}
+
+// DC prediction using only the left edge: (sum of 16 left pixels + 8) >> 4.
+void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left, int bd) {
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(dc_sum_16(left)), 4);
+  (void)above;
+  (void)bd;
+  dc_store_16x16(dst, stride, dc);
+}
+
+// DC prediction using only the above edge: (sum of 16 above pixels + 8) >> 4.
+void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(dc_sum_16(above)), 4);
+  (void)left;
+  (void)bd;
+  dc_store_16x16(dst, stride, dc);
+}
+
+// Neither edge available: predict the mid-range value for this bitdepth.
+void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const uint16x8_t dc = vdupq_n_u16((uint16_t)(1 << (bd - 1)));
+  (void)left;
+  (void)above;
+  dc_store_16x16(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+// Sum of thirty-two consecutive reference pixels, tree-added lane-wise
+// before the final horizontal reduction.
+static INLINE uint32_t dc_sum_32(const uint16_t *ref) {
+  const uint16x8_t s01 = vaddq_u16(vld1q_u16(ref + 0), vld1q_u16(ref + 8));
+  const uint16x8_t s23 = vaddq_u16(vld1q_u16(ref + 16), vld1q_u16(ref + 24));
+  return horizontal_add_uint16x8(vaddq_u16(s01, s23));
+}
+
+// Fill a 32x32 block with the broadcast DC value (four 8-lane stores per
+// row).
+static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
+                                  const uint16x8_t dc) {
+  int rows = 32;
+  do {
+    vst1q_u16(dst + 0, dc);
+    vst1q_u16(dst + 8, dc);
+    vst1q_u16(dst + 16, dc);
+    vst1q_u16(dst + 24, dc);
+    dst += stride;
+  } while (--rows != 0);
+}
+
+// DC prediction for 32x32: rounded mean of the 32 above + 32 left pixels.
+// The partial sums are tree-added in 16-bit lanes, and the rounding shift
+// ((sum + 32) >> 6) is done in 32 bits before broadcasting the result.
+void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const uint16x8_t a01 = vaddq_u16(vld1q_u16(above + 0), vld1q_u16(above + 8));
+  const uint16x8_t a23 =
+      vaddq_u16(vld1q_u16(above + 16), vld1q_u16(above + 24));
+  const uint16x8_t l01 = vaddq_u16(vld1q_u16(left + 0), vld1q_u16(left + 8));
+  const uint16x8_t l23 = vaddq_u16(vld1q_u16(left + 16), vld1q_u16(left + 24));
+  const uint16x8_t edge_sum =
+      vaddq_u16(vaddq_u16(a01, a23), vaddq_u16(l01, l23));
+  const uint32_t sum = horizontal_add_uint16x8(edge_sum);
+  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 6), 0);
+  (void)bd;
+  dc_store_32x32(dst, stride, dc);
+}
+
+// DC prediction using only the left edge: (sum of 32 left pixels + 16) >> 5,
+// computed in 32 bits since the total can exceed 16 bits at high bitdepth.
+void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left, int bd) {
+  const uint32_t sum = dc_sum_32(left);
+  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
+  (void)above;
+  (void)bd;
+  dc_store_32x32(dst, stride, dc);
+}
+
+// DC prediction using only the above edge: (sum of 32 above pixels + 16) >> 5,
+// computed in 32 bits since the total can exceed 16 bits at high bitdepth.
+void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const uint32_t sum = dc_sum_32(above);
+  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
+  (void)left;
+  (void)bd;
+  dc_store_32x32(dst, stride, dc);
+}
+
+// Neither edge available: predict the mid-range value for this bitdepth.
+void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const uint16x8_t dc = vdupq_n_u16((uint16_t)(1 << (bd - 1)));
+  (void)left;
+  (void)above;
+  dc_store_32x32(dst, stride, dc);
+}
+
+// -----------------------------------------------------------------------------
+
+// D45 (45-degree diagonal) prediction for 4x4: each row is the AVG3-smoothed
+// above edge shifted one lane further along; only `above` is used.
+void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  uint16x8_t a0, a1, a2, d0;
+  uint16_t a7;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1q_u16(above);
+  a7 = above[7];
+
+  // [ above[1], ..., above[6], x, x ]
+  a1 = vextq_u16(a0, a0, 1);
+  // [ above[2], ..., above[7], x, x ]
+  a2 = vextq_u16(a0, a0, 2);
+
+  // d0[0] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[5] = AVG3(above[5], above[6], above[7]);
+  // d0[6] = x (don't care)
+  // d0[7] = x (don't care)
+  d0 = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+
+  // We want:
+  // stride=0 [ d0[0], d0[1], d0[2], d0[3] ]
+  // stride=1 [ d0[1], d0[2], d0[3], d0[4] ]
+  // stride=2 [ d0[2], d0[3], d0[4], d0[5] ]
+  // stride=3 [ d0[3], d0[4], d0[5], above[7] ]
+  vst1_u16(dst + 0 * stride, vget_low_u16(d0));
+  vst1_u16(dst + 1 * stride, vget_low_u16(vextq_u16(d0, d0, 1)));
+  vst1_u16(dst + 2 * stride, vget_low_u16(vextq_u16(d0, d0, 2)));
+  vst1_u16(dst + 3 * stride, vget_low_u16(vextq_u16(d0, d0, 3)));
+
+  // We stored d0[6] above, so fixup into above[7].
+  dst[3 * stride + 3] = a7;
+}
+
+// D45 prediction for 8x8: AVG3-smooth the above edge, then emit it shifted
+// one lane per row, padding the tail with duplicates of above[7].
+void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  uint16x8_t ax0, a0, a1, a7, d0;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a7 = vld1q_dup_u16(above + 7);
+
+  // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can
+  // shift in above[7] later, so shift a0 across by one to get the right
+  // inputs:
+  // [ x, above[0], ... , above[6] ]
+  ax0 = vextq_u16(a0, a0, 7);
+
+  // d0[0] = x (don't care)
+  // d0[1] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[7] = AVG3(above[6], above[7], above[8]);
+  d0 = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);
+
+  // Undo the earlier ext, incrementally shift in duplicates of above[7].
+  vst1q_u16(dst + 0 * stride, vextq_u16(d0, a7, 1));
+  vst1q_u16(dst + 1 * stride, vextq_u16(d0, a7, 2));
+  vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 3));
+  vst1q_u16(dst + 3 * stride, vextq_u16(d0, a7, 4));
+  vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 5));
+  vst1q_u16(dst + 5 * stride, vextq_u16(d0, a7, 6));
+  vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 7));
+  vst1q_u16(dst + 7 * stride, a7);
+}
+
+// D45 prediction for 16x16: AVG3-smooth the above edge in two 8-lane
+// vectors, then emit it shifted one lane per row, padding with above[15].
+void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  uint16x8_t ax0, a0, a1, a7, a8, a9, a15, d0[2];
+  (void)left;
+  (void)bd;
+
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a7 = vld1q_u16(above + 7);
+  a8 = vld1q_u16(above + 8);
+  a9 = vld1q_u16(above + 9);
+  a15 = vld1q_dup_u16(above + 15);
+
+  // [ x, above[0], ... , above[6] ]
+  ax0 = vextq_u16(a0, a0, 7);
+
+  // We have one unused lane here to leave room to shift in above[15] in the
+  // last lane:
+  // d0[0][0] = x (don't care)
+  // d0[0][1] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[0][7] = AVG3(above[6], above[7], above[8]);
+  // d0[1][0] = AVG3(above[7], above[8], above[9]);
+  // ...
+  // d0[1][7] = AVG3(above[14], above[15], above[16]);
+  d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);
+  d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8);
+
+  // Incrementally shift in duplicates of above[15].
+  vst1q_u16(dst + 0 * stride + 0, vextq_u16(d0[0], d0[1], 1));
+  vst1q_u16(dst + 0 * stride + 8, vextq_u16(d0[1], a15, 1));
+  vst1q_u16(dst + 1 * stride + 0, vextq_u16(d0[0], d0[1], 2));
+  vst1q_u16(dst + 1 * stride + 8, vextq_u16(d0[1], a15, 2));
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 3));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], a15, 3));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d0[0], d0[1], 4));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d0[1], a15, 4));
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 5));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], a15, 5));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d0[0], d0[1], 6));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d0[1], a15, 6));
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 7));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], a15, 7));
+  vst1q_u16(dst + 7 * stride + 0, d0[1]);
+  vst1q_u16(dst + 7 * stride + 8, a15);
+
+  // Rows 8-15 only reference lanes at or beyond above[15] in their right
+  // half, so that half is all duplicates of above[15].
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[1], a15, 1));
+  vst1q_u16(dst + 8 * stride + 8, a15);
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d0[1], a15, 2));
+  vst1q_u16(dst + 9 * stride + 8, a15);
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[1], a15, 3));
+  vst1q_u16(dst + 10 * stride + 8, a15);
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d0[1], a15, 4));
+  vst1q_u16(dst + 11 * stride + 8, a15);
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[1], a15, 5));
+  vst1q_u16(dst + 12 * stride + 8, a15);
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d0[1], a15, 6));
+  vst1q_u16(dst + 13 * stride + 8, a15);
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[1], a15, 7));
+  vst1q_u16(dst + 14 * stride + 8, a15);
+  vst1q_u16(dst + 15 * stride + 0, a15);
+  vst1q_u16(dst + 15 * stride + 8, a15);
+}
+
+// D45 prediction for 32x32: AVG3-smooth the above edge in four 8-lane
+// vectors and emit one lane-shifted row per iteration, shifting in
+// duplicates of above[31] at the tail (same scheme as the 16x16 version).
+void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  uint16x8_t ax0, a0, a1, a7, a8, a9, a15, a16, a17, a23, a24, a25, a31, d0[4];
+  int i;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a7 = vld1q_u16(above + 7);
+  a8 = vld1q_u16(above + 8);
+  a9 = vld1q_u16(above + 9);
+  a15 = vld1q_u16(above + 15);
+  a16 = vld1q_u16(above + 16);
+  a17 = vld1q_u16(above + 17);
+  a23 = vld1q_u16(above + 23);
+  a24 = vld1q_u16(above + 24);
+  a25 = vld1q_u16(above + 25);
+  a31 = vld1q_dup_u16(above + 31);
+
+  // [ x, above[0], ... , above[6] ]
+  ax0 = vextq_u16(a0, a0, 7);
+
+  // AVG3 of consecutive above triples; d0[0] lane 0 is a don't-care that is
+  // shifted out by the first ext in the loop below.
+  d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);
+  d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8);
+  d0[2] = vrhaddq_u16(vhaddq_u16(a15, a17), a16);
+  d0[3] = vrhaddq_u16(vhaddq_u16(a23, a25), a24);
+
+  // Each iteration shifts the whole 32-lane result left by one lane,
+  // pulling a duplicate of above[31] into the final lane.
+  for (i = 0; i < 32; ++i) {
+    d0[0] = vextq_u16(d0[0], d0[1], 1);
+    d0[1] = vextq_u16(d0[1], d0[2], 1);
+    d0[2] = vextq_u16(d0[2], d0[3], 1);
+    d0[3] = vextq_u16(d0[3], a31, 1);
+    vst1q_u16(dst + 0, d0[0]);
+    vst1q_u16(dst + 8, d0[1]);
+    vst1q_u16(dst + 16, d0[2]);
+    vst1q_u16(dst + 24, d0[3]);
+    dst += stride;
+  }
+}
+
+// -----------------------------------------------------------------------------
+
+// D63 prediction for 4x4: rows alternate between 2-tap rounded averages and
+// 3-tap (AVG3) averages of the above edge; only `above` is used.
+void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  uint16x4_t a0, a1, a2, a3, d0, d1, d2, d3;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1_u16(above + 0);
+  a1 = vld1_u16(above + 1);
+  a2 = vld1_u16(above + 2);
+  a3 = vld1_u16(above + 3);
+
+  d0 = vrhadd_u16(a0, a1);
+  d1 = vrhadd_u16(vhadd_u16(a0, a2), a1);
+  d2 = vrhadd_u16(a1, a2);
+  d3 = vrhadd_u16(vhadd_u16(a1, a3), a2);
+
+  // Note that here we are performing a full avg calculation for the final
+  // elements rather than storing a duplicate of above[3], which differs
+  // (correctly) from the general scheme employed by the bs={8,16,32}
+  // implementations in order to match the original C implementation.
+  vst1_u16(dst + 0 * stride, d0);
+  vst1_u16(dst + 1 * stride, d1);
+  vst1_u16(dst + 2 * stride, d2);
+  vst1_u16(dst + 3 * stride, d3);
+}
+
+// D63 prediction for 8x8: even rows use the 2-tap average of the above
+// edge, odd rows the 3-tap (AVG3) average, each pair shifted one lane
+// further and padded with duplicates of above[7].
+void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  uint16x8_t a0, a1, a2, a7, d0, d1, d0_ext, d1_ext;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a2 = vld1q_u16(above + 2);
+  a7 = vld1q_dup_u16(above + 7);
+
+  d0 = vrhaddq_u16(a0, a1);
+  d1 = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+
+  // We want to store:
+  // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ]
+  // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ]
+  // stride=2 [ d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], a[7], a[7] ]
+  // stride=3 [ d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], a[7], a[7] ]
+  // stride=4 [ d0[2], d0[3], d0[4], d0[5], d0[6], a[7], a[7], a[7] ]
+  // stride=5 [ d1[2], d1[3], d1[4], d1[5], d1[6], a[7], a[7], a[7] ]
+  // stride=6 [ d0[3], d0[4], d0[5], d0[6], a[7], a[7], a[7], a[7] ]
+  // stride=7 [ d1[3], d1[4], d1[5], d1[6], a[7], a[7], a[7], a[7] ]
+  // Note in particular that d0[7] and d1[7] are only ever referenced in the
+  // stride=0 and stride=1 cases respectively, and in later strides are
+  // replaced by a copy of above[7]. These are equivalent if for i>7,
+  // above[i]==above[7], however that is not always the case.
+
+  // Strip out d0[7] and d1[7] so that we can replace it with an additional
+  // copy of above[7], the first vector here doesn't matter so just reuse
+  // d0/d1.
+  d0_ext = vextq_u16(d0, d0, 7);
+  d1_ext = vextq_u16(d1, d1, 7);
+
+  // Shuffle in duplicates of above[7] and store.
+  vst1q_u16(dst + 0 * stride, d0);
+  vst1q_u16(dst + 1 * stride, d1);
+  vst1q_u16(dst + 2 * stride, vextq_u16(d0_ext, a7, 2));
+  vst1q_u16(dst + 3 * stride, vextq_u16(d1_ext, a7, 2));
+  vst1q_u16(dst + 4 * stride, vextq_u16(d0_ext, a7, 3));
+  vst1q_u16(dst + 5 * stride, vextq_u16(d1_ext, a7, 3));
+  vst1q_u16(dst + 6 * stride, vextq_u16(d0_ext, a7, 4));
+  vst1q_u16(dst + 7 * stride, vextq_u16(d1_ext, a7, 4));
+}
+
+void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation:
+  // even rows are 2-tap averages (d0), odd rows 3-tap averages (d1), each
+  // pair shifted one lane further and padded with duplicates of above[15].
+  uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0[2], d1[2], d0_ext, d1_ext;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a2 = vld1q_u16(above + 2);
+  a8 = vld1q_u16(above + 8);
+  a9 = vld1q_u16(above + 9);
+  a10 = vld1q_u16(above + 10);
+  a15 = vld1q_dup_u16(above + 15);
+
+  d0[0] = vrhaddq_u16(a0, a1);
+  d0[1] = vrhaddq_u16(a8, a9);
+  d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+  d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9);
+
+  // Strip out the final element of d0/d1 so that we can replace it with an
+  // additional copy of above[15], the first vector here doesn't matter so
+  // just reuse the same vector.
+  d0_ext = vextq_u16(d0[1], d0[1], 7);
+  d1_ext = vextq_u16(d1[1], d1[1], 7);
+
+  // Shuffle in duplicates of above[15] and store. Note that cases involving
+  // {d0,d1}_ext require an extra shift to undo the shifting out of the final
+  // element from above.
+  vst1q_u16(dst + 0 * stride + 0, d0[0]);
+  vst1q_u16(dst + 0 * stride + 8, d0[1]);
+  vst1q_u16(dst + 1 * stride + 0, d1[0]);
+  vst1q_u16(dst + 1 * stride + 8, d1[1]);
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_ext, a15, 2));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_ext, a15, 2));
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_ext, a15, 3));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_ext, a15, 3));
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_ext, a15, 4));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_ext, a15, 4));
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4));
+  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_ext, a15, 5));
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_ext, a15, 5));
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_ext, a15, 6));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_ext, a15, 6));
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6));
+  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_ext, a15, 7));
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6));
+  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_ext, a15, 7));
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7));
+  vst1q_u16(dst + 14 * stride + 8, a15);
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 15 * stride + 8, a15);
+}
+
void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation.
  uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4],
      d1[4], d0_ext, d1_ext;
  (void)left;
  (void)bd;

  a0 = vld1q_u16(above + 0);
  a1 = vld1q_u16(above + 1);
  a2 = vld1q_u16(above + 2);
  a8 = vld1q_u16(above + 8);
  a9 = vld1q_u16(above + 9);
  a10 = vld1q_u16(above + 10);
  a16 = vld1q_u16(above + 16);
  a17 = vld1q_u16(above + 17);
  a18 = vld1q_u16(above + 18);
  a24 = vld1q_u16(above + 24);
  a25 = vld1q_u16(above + 25);
  a26 = vld1q_u16(above + 26);
  a31 = vld1q_dup_u16(above + 31);

  // d0 holds the two-tap averages AVG2(above[i], above[i+1]) and d1 the
  // three-tap averages AVG3(above[i], above[i+1], above[i+2]), covering the
  // 32 columns in four 8-lane vectors each.
  d0[0] = vrhaddq_u16(a0, a1);
  d0[1] = vrhaddq_u16(a8, a9);
  d0[2] = vrhaddq_u16(a16, a17);
  d0[3] = vrhaddq_u16(a24, a25);
  d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
  d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9);
  d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17);
  d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25);

  // Strip out the final element of d0/d1 so that we can replace it with an
  // additional copy of above[31], the first vector here doesn't matter so just
  // reuse the same vector.
  d0_ext = vextq_u16(d0[3], d0[3], 7);
  d1_ext = vextq_u16(d1[3], d1[3], 7);

  // Shuffle in duplicates of above[31] and store. Note that cases involving
  // {d0,d1}_ext require an extra shift to undo the shifting out of the final
  // element from above.

  vst1q_u16(dst + 0 * stride + 0, d0[0]);
  vst1q_u16(dst + 0 * stride + 8, d0[1]);
  vst1q_u16(dst + 0 * stride + 16, d0[2]);
  vst1q_u16(dst + 0 * stride + 24, d0[3]);
  vst1q_u16(dst + 1 * stride + 0, d1[0]);
  vst1q_u16(dst + 1 * stride + 8, d1[1]);
  vst1q_u16(dst + 1 * stride + 16, d1[2]);
  vst1q_u16(dst + 1 * stride + 24, d1[3]);

  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1));
  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1));
  vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1));
  vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0_ext, a31, 2));
  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1));
  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1));
  vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1));
  vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1_ext, a31, 2));

  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2));
  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2));
  vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2));
  vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0_ext, a31, 3));
  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2));
  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2));
  vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2));
  vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1_ext, a31, 3));

  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3));
  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3));
  vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3));
  vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0_ext, a31, 4));
  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3));
  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3));
  vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3));
  vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1_ext, a31, 4));

  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4));
  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4));
  vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[2], d0[3], 4));
  vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0_ext, a31, 5));
  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4));
  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4));
  vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4));
  vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1_ext, a31, 5));

  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5));
  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5));
  vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5));
  vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0_ext, a31, 6));
  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5));
  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5));
  vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5));
  vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1_ext, a31, 6));

  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6));
  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6));
  vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6));
  vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0_ext, a31, 7));
  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6));
  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6));
  vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6));
  vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1_ext, a31, 7));

  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7));
  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7));
  vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[2], d0[3], 7));
  vst1q_u16(dst + 14 * stride + 24, a31);
  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7));
  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7));
  vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7));
  vst1q_u16(dst + 15 * stride + 24, a31);

  // From row 16 onwards the whole pattern has shifted a full vector to the
  // left, so the first 8-lane store starts at d0[1]/d1[1] and the tail of the
  // row is entirely above[31].
  vst1q_u16(dst + 16 * stride + 0, d0[1]);
  vst1q_u16(dst + 16 * stride + 8, d0[2]);
  vst1q_u16(dst + 16 * stride + 16, vextq_u16(d0_ext, a31, 1));
  vst1q_u16(dst + 16 * stride + 24, a31);
  vst1q_u16(dst + 17 * stride + 0, d1[1]);
  vst1q_u16(dst + 17 * stride + 8, d1[2]);
  vst1q_u16(dst + 17 * stride + 16, vextq_u16(d1_ext, a31, 1));
  vst1q_u16(dst + 17 * stride + 24, a31);

  vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1));
  vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1));
  vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0_ext, a31, 2));
  vst1q_u16(dst + 18 * stride + 24, a31);
  vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1));
  vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1));
  vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1_ext, a31, 2));
  vst1q_u16(dst + 19 * stride + 24, a31);

  vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2));
  vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2));
  vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0_ext, a31, 3));
  vst1q_u16(dst + 20 * stride + 24, a31);
  vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2));
  vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2));
  vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1_ext, a31, 3));
  vst1q_u16(dst + 21 * stride + 24, a31);

  vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3));
  vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3));
  vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0_ext, a31, 4));
  vst1q_u16(dst + 22 * stride + 24, a31);
  vst1q_u16(dst + 23 * stride + 0, vextq_u16(d1[1], d1[2], 3));
  vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3));
  vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1_ext, a31, 4));
  vst1q_u16(dst + 23 * stride + 24, a31);

  vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4));
  vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4));
  vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0_ext, a31, 5));
  vst1q_u16(dst + 24 * stride + 24, a31);
  vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4));
  vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4));
  vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1_ext, a31, 5));
  vst1q_u16(dst + 25 * stride + 24, a31);

  vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5));
  vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5));
  vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0_ext, a31, 6));
  vst1q_u16(dst + 26 * stride + 24, a31);
  vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5));
  vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5));
  vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1_ext, a31, 6));
  vst1q_u16(dst + 27 * stride + 24, a31);

  vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6));
  vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6));
  vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0_ext, a31, 7));
  vst1q_u16(dst + 28 * stride + 24, a31);
  vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6));
  vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6));
  vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1_ext, a31, 7));
  vst1q_u16(dst + 29 * stride + 24, a31);

  vst1q_u16(dst + 30 * stride + 0, vextq_u16(d0[1], d0[2], 7));
  vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7));
  vst1q_u16(dst + 30 * stride + 16, a31);
  vst1q_u16(dst + 30 * stride + 24, a31);
  vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7));
  vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7));
  vst1q_u16(dst + 31 * stride + 16, a31);
  vst1q_u16(dst + 31 * stride + 24, a31);
}
+
+// -----------------------------------------------------------------------------
+
void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  // D117: rows 0/1 are the AVG2/AVG3 smoothings of the above row; rows 2/3
  // repeat them shifted right by one with a smoothed left-column value
  // inserted at column 0. See vpx_highbd_d117_predictor_8x8_neon for a fuller
  // walkthrough of the same scheme.
  uint16x4_t abv_m1, abv0, wrap_above, left0, left1, wrap_left, col, col_even,
      col_odd, row_avg2, row_avg3;
  (void)bd;

  abv_m1 = vld1_u16(above - 1);
  abv0 = vld1_u16(above + 0);
  // [ left[0], above[-1], above[0], above[1] ]
  wrap_above = vext_u16(vld1_dup_u16(left), abv_m1, 3);

  left0 = vld1_u16(left + 0);
  // [ left[1], left[2], left[3], x ] -- the final lane is never used; fill it
  // with a copy of left[0] rather than loading left[4], which would read past
  // the end of the buffer.
  left1 = vext_u16(left0, left0, 1);
  // [ above[-1], left[0], left[1], left[2] ]
  wrap_left = vext_u16(vld1_dup_u16(above - 1), left0, 3);

  // row_avg2[i] = AVG2(above[i-1], above[i])
  row_avg2 = vrhadd_u16(abv_m1, abv0);
  // row_avg3[i] = AVG3(above[i-2], above[i-1], above[i]), with left[0]
  // standing in for above[-2] in lane 0.
  row_avg3 = vrhadd_u16(vhadd_u16(wrap_above, abv0), abv_m1);

  // col[i] = AVG3(left[i-1], left[i], left[i+1]), with above[-1] standing in
  // for left[-1] in lane 0. Only the first two lanes are ever shifted in.
  col = vrhadd_u16(vhadd_u16(wrap_left, left1), left0);
  col_even = vdup_lane_u16(col, 0);
  col_odd = vdup_lane_u16(col, 1);

  vst1_u16(dst + 0 * stride, row_avg2);
  vst1_u16(dst + 1 * stride, row_avg3);
  vst1_u16(dst + 2 * stride, vext_u16(col_even, row_avg2, 3));
  vst1_u16(dst + 3 * stride, vext_u16(col_odd, row_avg3, 3));
}
+
void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  // D117: rows 0/1 are the AVG2/AVG3 smoothings of the above row; each
  // subsequent pair of rows repeats them shifted right by one, with smoothed
  // left-column values shifted in at column 0.
  uint16x8_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1;
  (void)bd;

  az = vld1q_u16(above - 1);
  a0 = vld1q_u16(above + 0);
  // [ left[0], above[-1], ..., above[5] ]
  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);

  l0 = vld1q_u16(left + 0);
  // The last lane here is unused, reading left[8] could cause a buffer
  // over-read, so just fill with a duplicate of left[0] to avoid needing to
  // materialize a zero:
  // [ left[1], ... , left[7], x ]
  l1 = vextq_u16(l0, l0, 1);
  // [ above[-1], left[0], ..., left[6] ]
  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);

  // d0[0] = AVG2(above[-1], above[0])
  // ...
  // d0[7] = AVG2(above[6], above[7])
  d0 = vrhaddq_u16(az, a0);

  // d1[0] = AVG3(left[0], above[-1], above[0])
  // d1[1] = AVG3(above[-1], above[0], above[1])
  // ...
  // d1[7] = AVG3(above[5], above[6], above[7])
  d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az);

  // The ext instruction shifts elements in from the end of the vector rather
  // than the start, so reverse the vector to put the elements to be shifted in
  // at the end:
  // col0[7] = AVG3(above[-1], left[0], left[1])
  // col0[6] = AVG3(left[0], left[1], left[2])
  // ...
  // col0[0] = AVG3(left[6], left[7], left[8])
  col0 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
  col0 = vrev64q_u16(vextq_u16(col0, col0, 4));

  // We don't care about the first parameter to this uzp since we only ever use
  // the high three elements, we just use col0 again since it is already
  // available:
  // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ]
  // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ]
  col0_even = vuzpq_u16(col0, col0).val[1];
  col0_odd = vuzpq_u16(col0, col0).val[0];

  // Incrementally shift more elements from col0 into d0/1:
  // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ]
  // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ]
  // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ]
  // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
  // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ]
  // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ]
  // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ]
  // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ]
  vst1q_u16(dst + 0 * stride, d0);
  vst1q_u16(dst + 1 * stride, d1);
  vst1q_u16(dst + 2 * stride, vextq_u16(col0_even, d0, 7));
  vst1q_u16(dst + 3 * stride, vextq_u16(col0_odd, d1, 7));
  vst1q_u16(dst + 4 * stride, vextq_u16(col0_even, d0, 6));
  vst1q_u16(dst + 5 * stride, vextq_u16(col0_odd, d1, 6));
  vst1q_u16(dst + 6 * stride, vextq_u16(col0_even, d0, 5));
  vst1q_u16(dst + 7 * stride, vextq_u16(col0_odd, d1, 5));
}
+
void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // See vpx_highbd_d117_predictor_8x8_neon for details on the implementation.
  uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, col0_lo,
      col0_hi, col0_even, col0_odd, d0_lo, d0_hi, d1_lo, d1_hi;
  (void)bd;

  az = vld1q_u16(above - 1);
  a0 = vld1q_u16(above + 0);
  a6 = vld1q_u16(above + 6);
  a7 = vld1q_u16(above + 7);
  a8 = vld1q_u16(above + 8);
  // [ left[0], above[-1], ..., above[5] ]
  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);

  l0 = vld1q_u16(left + 0);
  l1 = vld1q_u16(left + 1);
  l7 = vld1q_u16(left + 7);
  l8 = vld1q_u16(left + 8);
  // The last lane here is unused, reading left[16] could cause a buffer
  // over-read, so just fill with a duplicate of left[8] to avoid needing to
  // materialize a zero:
  // [ left[9], ... , left[15], x ]
  l9 = vextq_u16(l8, l8, 1);
  // [ above[-1], left[0], ..., left[6] ]
  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);

  // d0 = row 0: AVG2 of the above row; d1 = row 1: AVG3 of the above row
  // (lane 0 of the low half pulls in left[0]).
  d0_lo = vrhaddq_u16(az, a0);
  d0_hi = vrhaddq_u16(a7, a8);
  d1_lo = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
  d1_hi = vrhaddq_u16(vhaddq_u16(a6, a8), a7);

  // col0 = AVG3 of the left column (lane 0 pulls in above[-1]).
  col0_lo = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
  col0_hi = vrhaddq_u16(vhaddq_u16(l7, l9), l8);

  // Reverse within each vector, then swap the array indices in the uzp to
  // complete the reversal across all 16 elements.
  col0_lo = vrev64q_u16(vextq_u16(col0_lo, col0_lo, 4));
  col0_hi = vrev64q_u16(vextq_u16(col0_hi, col0_hi, 4));
  col0_even = vuzpq_u16(col0_hi, col0_lo).val[1];
  col0_odd = vuzpq_u16(col0_hi, col0_lo).val[0];

  // Each successive pair of rows shifts one more column element in front of
  // rows 0/1 (even rows take col0_even, odd rows col0_odd).
  vst1q_u16(dst + 0 * stride + 0, d0_lo);
  vst1q_u16(dst + 0 * stride + 8, d0_hi);
  vst1q_u16(dst + 1 * stride + 0, d1_lo);
  vst1q_u16(dst + 1 * stride + 8, d1_hi);

  vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even, d0_lo, 7));
  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_lo, d0_hi, 7));
  vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd, d1_lo, 7));
  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_lo, d1_hi, 7));

  vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even, d0_lo, 6));
  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_lo, d0_hi, 6));
  vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd, d1_lo, 6));
  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_lo, d1_hi, 6));

  vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even, d0_lo, 5));
  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_lo, d0_hi, 5));
  vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd, d1_lo, 5));
  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_lo, d1_hi, 5));

  vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even, d0_lo, 4));
  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_lo, d0_hi, 4));
  vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd, d1_lo, 4));
  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_lo, d1_hi, 4));

  vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even, d0_lo, 3));
  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_lo, d0_hi, 3));
  vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd, d1_lo, 3));
  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_lo, d1_hi, 3));

  vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even, d0_lo, 2));
  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_lo, d0_hi, 2));
  vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd, d1_lo, 2));
  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_lo, d1_hi, 2));

  vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even, d0_lo, 1));
  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_lo, d0_hi, 1));
  vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd, d1_lo, 1));
  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_lo, d1_hi, 1));
}
+
void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // See vpx_highbd_d117_predictor_8x8_neon for details on the implementation.
  uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7,
      l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], col0[4],
      col0_even[2], col0_odd[2];
  (void)bd;

  az = vld1q_u16(above - 1);
  a0 = vld1q_u16(above + 0);
  a6 = vld1q_u16(above + 6);
  a7 = vld1q_u16(above + 7);
  a8 = vld1q_u16(above + 8);
  a14 = vld1q_u16(above + 14);
  a15 = vld1q_u16(above + 15);
  a16 = vld1q_u16(above + 16);
  a22 = vld1q_u16(above + 22);
  a23 = vld1q_u16(above + 23);
  a24 = vld1q_u16(above + 24);
  // [ left[0], above[-1], ..., above[5] ]
  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);

  l0 = vld1q_u16(left + 0);
  l1 = vld1q_u16(left + 1);
  l7 = vld1q_u16(left + 7);
  l8 = vld1q_u16(left + 8);
  l9 = vld1q_u16(left + 9);
  l15 = vld1q_u16(left + 15);
  l16 = vld1q_u16(left + 16);
  l17 = vld1q_u16(left + 17);
  l23 = vld1q_u16(left + 23);
  l24 = vld1q_u16(left + 24);
  // The last lane here is unused, and loading from left + 25 would read
  // left[32], past the end of the buffer, so just fill the final lane with a
  // duplicate of left[24] to avoid needing to materialize a zero:
  // [ left[25], ... , left[31], x ]
  l25 = vextq_u16(l24, l24, 1);
  // [ above[-1], left[0], ..., left[6] ]
  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);

  // d0 = row 0: AVG2 of the above row; d1 = row 1: AVG3 of the above row
  // (lane 0 of d1[0] pulls in left[0]).
  d0[0] = vrhaddq_u16(az, a0);
  d0[1] = vrhaddq_u16(a7, a8);
  d0[2] = vrhaddq_u16(a15, a16);
  d0[3] = vrhaddq_u16(a23, a24);
  d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
  d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
  d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15);
  d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23);

  // col0 = AVG3 of the left column (lane 0 pulls in above[-1]).
  col0[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
  col0[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
  col0[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16);
  col0[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24);

  // Reverse within each vector, then swap the array indices in both the uzp
  // and the col0_{even,odd} assignment to complete the reversal across all
  // 32-elements.
  col0[0] = vrev64q_u16(vextq_u16(col0[0], col0[0], 4));
  col0[1] = vrev64q_u16(vextq_u16(col0[1], col0[1], 4));
  col0[2] = vrev64q_u16(vextq_u16(col0[2], col0[2], 4));
  col0[3] = vrev64q_u16(vextq_u16(col0[3], col0[3], 4));

  col0_even[1] = vuzpq_u16(col0[1], col0[0]).val[1];
  col0_even[0] = vuzpq_u16(col0[3], col0[2]).val[1];
  col0_odd[1] = vuzpq_u16(col0[1], col0[0]).val[0];
  col0_odd[0] = vuzpq_u16(col0[3], col0[2]).val[0];

  // Each successive pair of rows shifts one more column element in front of
  // rows 0/1 (even rows take col0_even, odd rows col0_odd).
  vst1q_u16(dst + 0 * stride + 0, d0[0]);
  vst1q_u16(dst + 0 * stride + 8, d0[1]);
  vst1q_u16(dst + 0 * stride + 16, d0[2]);
  vst1q_u16(dst + 0 * stride + 24, d0[3]);
  vst1q_u16(dst + 1 * stride + 0, d1[0]);
  vst1q_u16(dst + 1 * stride + 8, d1[1]);
  vst1q_u16(dst + 1 * stride + 16, d1[2]);
  vst1q_u16(dst + 1 * stride + 24, d1[3]);

  vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even[1], d0[0], 7));
  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[0], d0[1], 7));
  vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[1], d0[2], 7));
  vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[2], d0[3], 7));
  vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd[1], d1[0], 7));
  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 7));
  vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 7));
  vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 7));

  vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even[1], d0[0], 6));
  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[0], d0[1], 6));
  vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[1], d0[2], 6));
  vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[2], d0[3], 6));
  vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd[1], d1[0], 6));
  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[0], d1[1], 6));
  vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[1], d1[2], 6));
  vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[2], d1[3], 6));

  vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even[1], d0[0], 5));
  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[0], d0[1], 5));
  vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[1], d0[2], 5));
  vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[2], d0[3], 5));
  vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd[1], d1[0], 5));
  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[0], d1[1], 5));
  vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[1], d1[2], 5));
  vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[2], d1[3], 5));

  vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even[1], d0[0], 4));
  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[0], d0[1], 4));
  vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[1], d0[2], 4));
  vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[2], d0[3], 4));
  vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd[1], d1[0], 4));
  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[0], d1[1], 4));
  vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[1], d1[2], 4));
  vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[2], d1[3], 4));

  vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even[1], d0[0], 3));
  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[0], d0[1], 3));
  vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[1], d0[2], 3));
  vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[2], d0[3], 3));
  vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd[1], d1[0], 3));
  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[0], d1[1], 3));
  vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[1], d1[2], 3));
  vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[2], d1[3], 3));

  vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even[1], d0[0], 2));
  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[0], d0[1], 2));
  vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[1], d0[2], 2));
  vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[2], d0[3], 2));
  vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd[1], d1[0], 2));
  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[0], d1[1], 2));
  vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[1], d1[2], 2));
  vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[2], d1[3], 2));

  vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even[1], d0[0], 1));
  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[0], d0[1], 1));
  vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[1], d0[2], 1));
  vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[2], d0[3], 1));
  vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd[1], d1[0], 1));
  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[0], d1[1], 1));
  vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[1], d1[2], 1));
  vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[2], d1[3], 1));

  vst1q_u16(dst + 16 * stride + 0, col0_even[1]);
  vst1q_u16(dst + 16 * stride + 8, d0[0]);
  vst1q_u16(dst + 16 * stride + 16, d0[1]);
  vst1q_u16(dst + 16 * stride + 24, d0[2]);
  vst1q_u16(dst + 17 * stride + 0, col0_odd[1]);
  vst1q_u16(dst + 17 * stride + 8, d1[0]);
  vst1q_u16(dst + 17 * stride + 16, d1[1]);
  vst1q_u16(dst + 17 * stride + 24, d1[2]);

  vst1q_u16(dst + 18 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 7));
  vst1q_u16(dst + 18 * stride + 8, vextq_u16(col0_even[1], d0[0], 7));
  vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[0], d0[1], 7));
  vst1q_u16(dst + 18 * stride + 24, vextq_u16(d0[1], d0[2], 7));
  vst1q_u16(dst + 19 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 7));
  vst1q_u16(dst + 19 * stride + 8, vextq_u16(col0_odd[1], d1[0], 7));
  vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[0], d1[1], 7));
  vst1q_u16(dst + 19 * stride + 24, vextq_u16(d1[1], d1[2], 7));

  vst1q_u16(dst + 20 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 6));
  vst1q_u16(dst + 20 * stride + 8, vextq_u16(col0_even[1], d0[0], 6));
  vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[0], d0[1], 6));
  vst1q_u16(dst + 20 * stride + 24, vextq_u16(d0[1], d0[2], 6));
  vst1q_u16(dst + 21 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 6));
  vst1q_u16(dst + 21 * stride + 8, vextq_u16(col0_odd[1], d1[0], 6));
  vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[0], d1[1], 6));
  vst1q_u16(dst + 21 * stride + 24, vextq_u16(d1[1], d1[2], 6));

  vst1q_u16(dst + 22 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 5));
  vst1q_u16(dst + 22 * stride + 8, vextq_u16(col0_even[1], d0[0], 5));
  vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[0], d0[1], 5));
  vst1q_u16(dst + 22 * stride + 24, vextq_u16(d0[1], d0[2], 5));
  vst1q_u16(dst + 23 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 5));
  vst1q_u16(dst + 23 * stride + 8, vextq_u16(col0_odd[1], d1[0], 5));
  vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[0], d1[1], 5));
  vst1q_u16(dst + 23 * stride + 24, vextq_u16(d1[1], d1[2], 5));

  vst1q_u16(dst + 24 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 4));
  vst1q_u16(dst + 24 * stride + 8, vextq_u16(col0_even[1], d0[0], 4));
  vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[0], d0[1], 4));
  vst1q_u16(dst + 24 * stride + 24, vextq_u16(d0[1], d0[2], 4));
  vst1q_u16(dst + 25 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 4));
  vst1q_u16(dst + 25 * stride + 8, vextq_u16(col0_odd[1], d1[0], 4));
  vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[0], d1[1], 4));
  vst1q_u16(dst + 25 * stride + 24, vextq_u16(d1[1], d1[2], 4));

  vst1q_u16(dst + 26 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 3));
  vst1q_u16(dst + 26 * stride + 8, vextq_u16(col0_even[1], d0[0], 3));
  vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[0], d0[1], 3));
  vst1q_u16(dst + 26 * stride + 24, vextq_u16(d0[1], d0[2], 3));
  vst1q_u16(dst + 27 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 3));
  vst1q_u16(dst + 27 * stride + 8, vextq_u16(col0_odd[1], d1[0], 3));
  vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[0], d1[1], 3));
  vst1q_u16(dst + 27 * stride + 24, vextq_u16(d1[1], d1[2], 3));

  vst1q_u16(dst + 28 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 2));
  vst1q_u16(dst + 28 * stride + 8, vextq_u16(col0_even[1], d0[0], 2));
  vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[0], d0[1], 2));
  vst1q_u16(dst + 28 * stride + 24, vextq_u16(d0[1], d0[2], 2));
  vst1q_u16(dst + 29 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 2));
  vst1q_u16(dst + 29 * stride + 8, vextq_u16(col0_odd[1], d1[0], 2));
  vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[0], d1[1], 2));
  vst1q_u16(dst + 29 * stride + 24, vextq_u16(d1[1], d1[2], 2));

  vst1q_u16(dst + 30 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 1));
  vst1q_u16(dst + 30 * stride + 8, vextq_u16(col0_even[1], d0[0], 1));
  vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[0], d0[1], 1));
  vst1q_u16(dst + 30 * stride + 24, vextq_u16(d0[1], d0[2], 1));
  vst1q_u16(dst + 31 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 1));
  vst1q_u16(dst + 31 * stride + 8, vextq_u16(col0_odd[1], d1[0], 1));
  vst1q_u16(dst + 31 * stride + 16, vextq_u16(d1[0], d1[1], 1));
  vst1q_u16(dst + 31 * stride + 24, vextq_u16(d1[1], d1[2], 1));
}
+
+// -----------------------------------------------------------------------------
+
void vpx_highbd_d153_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
  uint16x4_t abv_m1, abv0, wrap_above, left0, left1, wrap_left, avg2_col,
      avg3_row, avg3_col, zip_lo, zip_hi;
  (void)bd;

  abv_m1 = vld1_u16(above - 1);
  abv0 = vld1_u16(above + 0);
  // [ left[0], above[-1], above[0], above[1] ]
  wrap_above = vext_u16(vld1_dup_u16(left), abv_m1, 3);

  left0 = vld1_u16(left);
  // [ left[1], left[2], left[3], x ] -- the final lane is never used; fill it
  // with a copy of left[0] rather than loading left[4], which would read past
  // the end of the buffer.
  left1 = vext_u16(left0, left0, 1);
  // [ above[-1], left[0], left[1], left[2] ]
  wrap_left = vext_u16(vld1_dup_u16(above - 1), left0, 3);

  // avg2_col[i] = AVG2(left[i-1], left[i]), with above[-1] standing in for
  // left[-1] in lane 0.
  avg2_col = vrhadd_u16(wrap_left, left0);
  // avg3_row[i] = AVG3 over the above row (lane 0 pulls in left[0]).
  avg3_row = vrhadd_u16(vhadd_u16(wrap_above, abv0), abv_m1);
  // avg3_col[i] = AVG3 over the left column (lane 0 pulls in above[-1]).
  avg3_col = vrhadd_u16(vhadd_u16(wrap_left, left1), left0);

  // Interleave the reversed column vectors so consecutive lanes alternate
  // between avg3_col and avg2_col, ready to be shifted in front of avg3_row.
  zip_lo = vzip_u16(vrev64_u16(avg3_col), vrev64_u16(avg2_col)).val[0];
  zip_hi = vzip_u16(vrev64_u16(avg3_col), vrev64_u16(avg2_col)).val[1];

  // Each row shifts two more column elements in:
  // stride=0 [ avg2_col[0], avg3_row[0], avg3_row[1], avg3_row[2] ]
  // stride=1 [ avg2_col[1], avg3_col[0], avg2_col[0], avg3_row[0] ]
  // stride=2 [ avg2_col[2], avg3_col[1], avg2_col[1], avg3_col[0] ]
  // stride=3 [ avg2_col[3], avg3_col[2], avg2_col[2], avg3_col[1] ]
  vst1_u16(dst + 0 * stride, vext_u16(zip_hi, avg3_row, 3));
  vst1_u16(dst + 1 * stride, vext_u16(zip_hi, avg3_row, 1));
  vst1_u16(dst + 2 * stride, vext_u16(zip_lo, zip_hi, 3));
  vst1_u16(dst + 3 * stride, vext_u16(zip_lo, zip_hi, 1));
}
+
void vpx_highbd_d153_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  // D153: each row starts with AVG2/AVG3 smoothings of the left column and
  // continues with the AVG3 smoothing of the above row; every successive row
  // shifts one more (AVG2, AVG3) column pair in from the left.
  uint16x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d0_rev, d2_rev, d20_lo,
      d20_hi;
  (void)bd;

  az = vld1q_u16(above - 1);
  a0 = vld1q_u16(above + 0);
  // [ left[0], above[-1], ... , above[5] ]
  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);

  l0 = vld1q_u16(left);
  // The last lane here is unused, reading left[8] could cause a buffer
  // over-read, so just fill with a duplicate of left[0] to avoid needing to
  // materialize a zero:
  // [ left[1], ... , left[7], x ]
  l1 = vextq_u16(l0, l0, 1);
  // [ above[-1], left[0], ... , left[6] ]
  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);

  // d0[0] = AVG2(above[-1], left[0])
  // d0[1] = AVG2(left[0], left[1])
  // ...
  // d0[7] = AVG2(left[6], left[7])
  d0 = vrhaddq_u16(azl0, l0);

  // d1[0] = AVG3(left[0], above[-1], above[0])
  // d1[1] = AVG3(above[-1], above[0], above[1])
  // ...
  // d1[7] = AVG3(above[5], above[6], above[7])
  d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az);

  // d2[0] = AVG3(above[-1], left[0], left[1])
  // d2[1] = AVG3(left[0], left[1], left[2])
  // ...
  // d2[7] = AVG3(left[6], left[7], left[8])
  d2 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);

  // The ext instruction shifts elements in from the end of the vector rather
  // than the start, so reverse the vectors to put the elements to be shifted
  // in at the end:
  d0_rev = vrev64q_u16(vextq_u16(d0, d0, 4));
  d2_rev = vrev64q_u16(vextq_u16(d2, d2, 4));

  d20_lo = vzipq_u16(d2_rev, d0_rev).val[0];
  d20_hi = vzipq_u16(d2_rev, d0_rev).val[1];

  // Incrementally shift more elements from d0/d2 reversed into d1:
  // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
  // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ]
  // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ]
  // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ]
  // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ]
  // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ]
  // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ]
  // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ]
  vst1q_u16(dst + 0 * stride, vextq_u16(d20_hi, d1, 7));
  vst1q_u16(dst + 1 * stride, vextq_u16(d20_hi, d1, 5));
  vst1q_u16(dst + 2 * stride, vextq_u16(d20_hi, d1, 3));
  vst1q_u16(dst + 3 * stride, vextq_u16(d20_hi, d1, 1));
  vst1q_u16(dst + 4 * stride, vextq_u16(d20_lo, d20_hi, 7));
  vst1q_u16(dst + 5 * stride, vextq_u16(d20_lo, d20_hi, 5));
  vst1q_u16(dst + 6 * stride, vextq_u16(d20_lo, d20_hi, 3));
  vst1q_u16(dst + 7 * stride, vextq_u16(d20_lo, d20_hi, 1));
}
+
void vpx_highbd_d153_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
  uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, d0[2], d1[2],
      d2[2], d20[4];
  (void)bd;

  az = vld1q_u16(above - 1);
  a0 = vld1q_u16(above + 0);
  a6 = vld1q_u16(above + 6);
  a7 = vld1q_u16(above + 7);
  a8 = vld1q_u16(above + 8);
  // [ left[0], above[-1], ... , above[5] ]
  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);

  l0 = vld1q_u16(left + 0);
  l1 = vld1q_u16(left + 1);
  l7 = vld1q_u16(left + 7);
  l8 = vld1q_u16(left + 8);
  // The last lane here is unused, reading left[16] could cause a buffer
  // over-read, so just fill with a duplicate of left[8] to avoid needing to
  // materialize a zero:
  // [ left[9], ... , left[15], x ]
  l9 = vextq_u16(l8, l8, 1);
  // [ above[-1], left[0], ... , left[6] ]
  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);

  // d0 = AVG2 of the left column, d1 = AVG3 of the above row, d2 = AVG3 of
  // the left column (see the lane-by-lane breakdown in the 8x8 version).
  d0[0] = vrhaddq_u16(azl0, l0);
  d0[1] = vrhaddq_u16(l7, l8);
  d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
  d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
  d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
  d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);

  // Reverse d0/d2 and interleave them so that each successive row can shift
  // one more (AVG2, AVG3) pair in via vext.
  d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4));
  d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4));
  d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4));
  d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4));

  d20[0] = vzipq_u16(d2[1], d0[1]).val[0];
  d20[1] = vzipq_u16(d2[1], d0[1]).val[1];
  d20[2] = vzipq_u16(d2[0], d0[0]).val[0];
  d20[3] = vzipq_u16(d2[0], d0[0]).val[1];

  vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[3], d1[0], 7));
  vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7));
  vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[3], d1[0], 5));
  vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5));
  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[3], d1[0], 3));
  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3));
  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[3], d1[0], 1));
  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1));

  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[2], d20[3], 7));
  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[3], d1[0], 7));
  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[2], d20[3], 5));
  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[3], d1[0], 5));
  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[2], d20[3], 3));
  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[3], d1[0], 3));
  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[2], d20[3], 1));
  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[3], d1[0], 1));

  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[1], d20[2], 7));
  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[2], d20[3], 7));
  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[1], d20[2], 5));
  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[2], d20[3], 5));
  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[1], d20[2], 3));
  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[2], d20[3], 3));
  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[1], d20[2], 1));
  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[2], d20[3], 1));

  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[0], d20[1], 7));
  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[1], d20[2], 7));
  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[0], d20[1], 5));
  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[1], d20[2], 5));
  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[0], d20[1], 3));
  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[1], d20[2], 3));
  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[0], d20[1], 1));
  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[1], d20[2], 1));
}
+
+void vpx_highbd_d153_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ // High bit-depth D153 intra predictor, 32x32 block. Same scheme as the
+ // 16x16 version, scaled up: d0 = 2-tap left averages, d1 = 3-tap smoothed
+ // above, d2 = 3-tap smoothed left; each output row is a four-vector window
+ // into [ d20[0..7] | d1[0..3] ] that slides left two lanes per row.
+ // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
+ uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7,
+ l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], d2[4], d20[8];
+ (void)bd;
+
+ az = vld1q_u16(above - 1);
+ a0 = vld1q_u16(above + 0);
+ a6 = vld1q_u16(above + 6);
+ a7 = vld1q_u16(above + 7);
+ a8 = vld1q_u16(above + 8);
+ a14 = vld1q_u16(above + 14);
+ a15 = vld1q_u16(above + 15);
+ a16 = vld1q_u16(above + 16);
+ a22 = vld1q_u16(above + 22);
+ a23 = vld1q_u16(above + 23);
+ a24 = vld1q_u16(above + 24);
+ // [ left[0], above[-1], ... , above[5] ]
+ // NOTE(review): previously commented as ending at above[13]; only 8 lanes.
+ l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+ l0 = vld1q_u16(left + 0);
+ l1 = vld1q_u16(left + 1);
+ l7 = vld1q_u16(left + 7);
+ l8 = vld1q_u16(left + 8);
+ l9 = vld1q_u16(left + 9);
+ l15 = vld1q_u16(left + 15);
+ l16 = vld1q_u16(left + 16);
+ l17 = vld1q_u16(left + 17);
+ l23 = vld1q_u16(left + 23);
+ l24 = vld1q_u16(left + 24);
+ // The last lane here is unused, reading left[32] could cause a buffer
+ // over-read, so just fill with a duplicate of left[24] to avoid needing to
+ // materialize a zero:
+ // [ left[25], ... , left[31], x ]
+ l25 = vextq_u16(l24, l24, 1);
+ // [ above[-1], left[0], ... , left[6] ]
+ // NOTE(review): previously commented as ending at left[14]; only 8 lanes.
+ azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+ // Rounded 2-tap averages of adjacent left pixels.
+ d0[0] = vrhaddq_u16(azl0, l0);
+ d0[1] = vrhaddq_u16(l7, l8);
+ d0[2] = vrhaddq_u16(l15, l16);
+ d0[3] = vrhaddq_u16(l23, l24);
+
+ // Rounded 3-tap smoothing of the above row: vrhadd(vhadd(outer, outer),
+ // center).
+ d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+ d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
+ d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15);
+ d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23);
+
+ // Rounded 3-tap smoothing of the left column.
+ d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+ d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
+ d2[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16);
+ d2[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24);
+
+ // Reverse all 8 lanes of the left-derived vectors (vext by 4 swaps halves,
+ // vrev64 reverses within each half) so they run bottom-to-top.
+ d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4));
+ d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4));
+ d0[2] = vrev64q_u16(vextq_u16(d0[2], d0[2], 4));
+ d0[3] = vrev64q_u16(vextq_u16(d0[3], d0[3], 4));
+ d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4));
+ d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4));
+ d2[2] = vrev64q_u16(vextq_u16(d2[2], d2[2], 4));
+ d2[3] = vrev64q_u16(vextq_u16(d2[3], d2[3], 4));
+
+ // Interleave the 3-tap and 2-tap left columns into the diagonal sequence,
+ // deepest (bottom-most) pixels first.
+ d20[0] = vzipq_u16(d2[3], d0[3]).val[0];
+ d20[1] = vzipq_u16(d2[3], d0[3]).val[1];
+ d20[2] = vzipq_u16(d2[2], d0[2]).val[0];
+ d20[3] = vzipq_u16(d2[2], d0[2]).val[1];
+ d20[4] = vzipq_u16(d2[1], d0[1]).val[0];
+ d20[5] = vzipq_u16(d2[1], d0[1]).val[1];
+ d20[6] = vzipq_u16(d2[0], d0[0]).val[0];
+ d20[7] = vzipq_u16(d2[0], d0[0]).val[1];
+
+ // Rows 0-31: within each group of four rows the vext index steps 7, 5, 3, 1
+ // (two lanes left per row); every four rows the window source shifts one
+ // vector deeper into d20.
+ vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[7], d1[0], 7));
+ vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7));
+ vst1q_u16(dst + 0 * stride + 16, vextq_u16(d1[1], d1[2], 7));
+ vst1q_u16(dst + 0 * stride + 24, vextq_u16(d1[2], d1[3], 7));
+ vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[7], d1[0], 5));
+ vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5));
+ vst1q_u16(dst + 1 * stride + 16, vextq_u16(d1[1], d1[2], 5));
+ vst1q_u16(dst + 1 * stride + 24, vextq_u16(d1[2], d1[3], 5));
+ vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[7], d1[0], 3));
+ vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3));
+ vst1q_u16(dst + 2 * stride + 16, vextq_u16(d1[1], d1[2], 3));
+ vst1q_u16(dst + 2 * stride + 24, vextq_u16(d1[2], d1[3], 3));
+ vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[7], d1[0], 1));
+ vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1));
+ vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 1));
+ vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 1));
+
+ vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[6], d20[7], 7));
+ vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[7], d1[0], 7));
+ vst1q_u16(dst + 4 * stride + 16, vextq_u16(d1[0], d1[1], 7));
+ vst1q_u16(dst + 4 * stride + 24, vextq_u16(d1[1], d1[2], 7));
+ vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[6], d20[7], 5));
+ vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[7], d1[0], 5));
+ vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[0], d1[1], 5));
+ vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[1], d1[2], 5));
+ vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[6], d20[7], 3));
+ vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[7], d1[0], 3));
+ vst1q_u16(dst + 6 * stride + 16, vextq_u16(d1[0], d1[1], 3));
+ vst1q_u16(dst + 6 * stride + 24, vextq_u16(d1[1], d1[2], 3));
+ vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[6], d20[7], 1));
+ vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[7], d1[0], 1));
+ vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[0], d1[1], 1));
+ vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[1], d1[2], 1));
+
+ vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[5], d20[6], 7));
+ vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[6], d20[7], 7));
+ vst1q_u16(dst + 8 * stride + 16, vextq_u16(d20[7], d1[0], 7));
+ vst1q_u16(dst + 8 * stride + 24, vextq_u16(d1[0], d1[1], 7));
+ vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[5], d20[6], 5));
+ vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[6], d20[7], 5));
+ vst1q_u16(dst + 9 * stride + 16, vextq_u16(d20[7], d1[0], 5));
+ vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[0], d1[1], 5));
+ vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[5], d20[6], 3));
+ vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[6], d20[7], 3));
+ vst1q_u16(dst + 10 * stride + 16, vextq_u16(d20[7], d1[0], 3));
+ vst1q_u16(dst + 10 * stride + 24, vextq_u16(d1[0], d1[1], 3));
+ vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[5], d20[6], 1));
+ vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[6], d20[7], 1));
+ vst1q_u16(dst + 11 * stride + 16, vextq_u16(d20[7], d1[0], 1));
+ vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[0], d1[1], 1));
+
+ vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[4], d20[5], 7));
+ vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[5], d20[6], 7));
+ vst1q_u16(dst + 12 * stride + 16, vextq_u16(d20[6], d20[7], 7));
+ vst1q_u16(dst + 12 * stride + 24, vextq_u16(d20[7], d1[0], 7));
+ vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[4], d20[5], 5));
+ vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[5], d20[6], 5));
+ vst1q_u16(dst + 13 * stride + 16, vextq_u16(d20[6], d20[7], 5));
+ vst1q_u16(dst + 13 * stride + 24, vextq_u16(d20[7], d1[0], 5));
+ vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[4], d20[5], 3));
+ vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[5], d20[6], 3));
+ vst1q_u16(dst + 14 * stride + 16, vextq_u16(d20[6], d20[7], 3));
+ vst1q_u16(dst + 14 * stride + 24, vextq_u16(d20[7], d1[0], 3));
+ vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[4], d20[5], 1));
+ vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[5], d20[6], 1));
+ vst1q_u16(dst + 15 * stride + 16, vextq_u16(d20[6], d20[7], 1));
+ vst1q_u16(dst + 15 * stride + 24, vextq_u16(d20[7], d1[0], 1));
+
+ vst1q_u16(dst + 16 * stride + 0, vextq_u16(d20[3], d20[4], 7));
+ vst1q_u16(dst + 16 * stride + 8, vextq_u16(d20[4], d20[5], 7));
+ vst1q_u16(dst + 16 * stride + 16, vextq_u16(d20[5], d20[6], 7));
+ vst1q_u16(dst + 16 * stride + 24, vextq_u16(d20[6], d20[7], 7));
+ vst1q_u16(dst + 17 * stride + 0, vextq_u16(d20[3], d20[4], 5));
+ vst1q_u16(dst + 17 * stride + 8, vextq_u16(d20[4], d20[5], 5));
+ vst1q_u16(dst + 17 * stride + 16, vextq_u16(d20[5], d20[6], 5));
+ vst1q_u16(dst + 17 * stride + 24, vextq_u16(d20[6], d20[7], 5));
+ vst1q_u16(dst + 18 * stride + 0, vextq_u16(d20[3], d20[4], 3));
+ vst1q_u16(dst + 18 * stride + 8, vextq_u16(d20[4], d20[5], 3));
+ vst1q_u16(dst + 18 * stride + 16, vextq_u16(d20[5], d20[6], 3));
+ vst1q_u16(dst + 18 * stride + 24, vextq_u16(d20[6], d20[7], 3));
+ vst1q_u16(dst + 19 * stride + 0, vextq_u16(d20[3], d20[4], 1));
+ vst1q_u16(dst + 19 * stride + 8, vextq_u16(d20[4], d20[5], 1));
+ vst1q_u16(dst + 19 * stride + 16, vextq_u16(d20[5], d20[6], 1));
+ vst1q_u16(dst + 19 * stride + 24, vextq_u16(d20[6], d20[7], 1));
+
+ vst1q_u16(dst + 20 * stride + 0, vextq_u16(d20[2], d20[3], 7));
+ vst1q_u16(dst + 20 * stride + 8, vextq_u16(d20[3], d20[4], 7));
+ vst1q_u16(dst + 20 * stride + 16, vextq_u16(d20[4], d20[5], 7));
+ vst1q_u16(dst + 20 * stride + 24, vextq_u16(d20[5], d20[6], 7));
+ vst1q_u16(dst + 21 * stride + 0, vextq_u16(d20[2], d20[3], 5));
+ vst1q_u16(dst + 21 * stride + 8, vextq_u16(d20[3], d20[4], 5));
+ vst1q_u16(dst + 21 * stride + 16, vextq_u16(d20[4], d20[5], 5));
+ vst1q_u16(dst + 21 * stride + 24, vextq_u16(d20[5], d20[6], 5));
+ vst1q_u16(dst + 22 * stride + 0, vextq_u16(d20[2], d20[3], 3));
+ vst1q_u16(dst + 22 * stride + 8, vextq_u16(d20[3], d20[4], 3));
+ vst1q_u16(dst + 22 * stride + 16, vextq_u16(d20[4], d20[5], 3));
+ vst1q_u16(dst + 22 * stride + 24, vextq_u16(d20[5], d20[6], 3));
+ vst1q_u16(dst + 23 * stride + 0, vextq_u16(d20[2], d20[3], 1));
+ vst1q_u16(dst + 23 * stride + 8, vextq_u16(d20[3], d20[4], 1));
+ vst1q_u16(dst + 23 * stride + 16, vextq_u16(d20[4], d20[5], 1));
+ vst1q_u16(dst + 23 * stride + 24, vextq_u16(d20[5], d20[6], 1));
+
+ vst1q_u16(dst + 24 * stride + 0, vextq_u16(d20[1], d20[2], 7));
+ vst1q_u16(dst + 24 * stride + 8, vextq_u16(d20[2], d20[3], 7));
+ vst1q_u16(dst + 24 * stride + 16, vextq_u16(d20[3], d20[4], 7));
+ vst1q_u16(dst + 24 * stride + 24, vextq_u16(d20[4], d20[5], 7));
+ vst1q_u16(dst + 25 * stride + 0, vextq_u16(d20[1], d20[2], 5));
+ vst1q_u16(dst + 25 * stride + 8, vextq_u16(d20[2], d20[3], 5));
+ vst1q_u16(dst + 25 * stride + 16, vextq_u16(d20[3], d20[4], 5));
+ vst1q_u16(dst + 25 * stride + 24, vextq_u16(d20[4], d20[5], 5));
+ vst1q_u16(dst + 26 * stride + 0, vextq_u16(d20[1], d20[2], 3));
+ vst1q_u16(dst + 26 * stride + 8, vextq_u16(d20[2], d20[3], 3));
+ vst1q_u16(dst + 26 * stride + 16, vextq_u16(d20[3], d20[4], 3));
+ vst1q_u16(dst + 26 * stride + 24, vextq_u16(d20[4], d20[5], 3));
+ vst1q_u16(dst + 27 * stride + 0, vextq_u16(d20[1], d20[2], 1));
+ vst1q_u16(dst + 27 * stride + 8, vextq_u16(d20[2], d20[3], 1));
+ vst1q_u16(dst + 27 * stride + 16, vextq_u16(d20[3], d20[4], 1));
+ vst1q_u16(dst + 27 * stride + 24, vextq_u16(d20[4], d20[5], 1));
+
+ vst1q_u16(dst + 28 * stride + 0, vextq_u16(d20[0], d20[1], 7));
+ vst1q_u16(dst + 28 * stride + 8, vextq_u16(d20[1], d20[2], 7));
+ vst1q_u16(dst + 28 * stride + 16, vextq_u16(d20[2], d20[3], 7));
+ vst1q_u16(dst + 28 * stride + 24, vextq_u16(d20[3], d20[4], 7));
+ vst1q_u16(dst + 29 * stride + 0, vextq_u16(d20[0], d20[1], 5));
+ vst1q_u16(dst + 29 * stride + 8, vextq_u16(d20[1], d20[2], 5));
+ vst1q_u16(dst + 29 * stride + 16, vextq_u16(d20[2], d20[3], 5));
+ vst1q_u16(dst + 29 * stride + 24, vextq_u16(d20[3], d20[4], 5));
+ vst1q_u16(dst + 30 * stride + 0, vextq_u16(d20[0], d20[1], 3));
+ vst1q_u16(dst + 30 * stride + 8, vextq_u16(d20[1], d20[2], 3));
+ vst1q_u16(dst + 30 * stride + 16, vextq_u16(d20[2], d20[3], 3));
+ vst1q_u16(dst + 30 * stride + 24, vextq_u16(d20[3], d20[4], 3));
+ vst1q_u16(dst + 31 * stride + 0, vextq_u16(d20[0], d20[1], 1));
+ vst1q_u16(dst + 31 * stride + 8, vextq_u16(d20[1], d20[2], 1));
+ vst1q_u16(dst + 31 * stride + 16, vextq_u16(d20[2], d20[3], 1));
+ vst1q_u16(dst + 31 * stride + 24, vextq_u16(d20[3], d20[4], 1));
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ // High bit-depth D135 intra predictor, 4x4 block. Builds one 3-tap smoothed
+ // diagonal from the reversed left column and the above row (including the
+ // above-left pixel), then each output row is a 1-lane shift of that diagonal.
+ // Variable names encode lane contents: e.g. L3210XA012 holds
+ // [ left[3..0], above[-1], above[0..2] ].
+ const uint16x8_t XA0123___ = vld1q_u16(above - 1);
+ const uint16x4_t L0123 = vld1_u16(left);
+ const uint16x4_t L3210 = vrev64_u16(L0123);
+ const uint16x8_t L____3210 = vcombine_u16(L0123, L3210);
+ const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___));
+ const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5);
+ const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6);
+ // Rounded 3-tap smoothing: vrhadd(vhadd(outer, outer), center).
+ const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_);
+ const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123);
+ const uint16x4_t row_0 = vget_low_u16(avg2);
+ const uint16x4_t row_1 = vget_high_u16(avg2);
+ // Successive rows start one lane earlier in the diagonal (vext 3, 2, 1, 0).
+ const uint16x4_t r0 = vext_u16(row_0, row_1, 3);
+ const uint16x4_t r1 = vext_u16(row_0, row_1, 2);
+ const uint16x4_t r2 = vext_u16(row_0, row_1, 1);
+ (void)bd;
+ vst1_u16(dst, r0);
+ dst += stride;
+ vst1_u16(dst, r1);
+ dst += stride;
+ vst1_u16(dst, r2);
+ dst += stride;
+ vst1_u16(dst, row_0);
+}
+
+void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ // High bit-depth D135 intra predictor, 8x8 block. row_0 is the 3-tap
+ // smoothed, reversed left column (plus above-left); row_1 is the 3-tap
+ // smoothed above row. Each output row is a 1-lane shift of [ row_0 | row_1 ].
+ const uint16x8_t XA0123456 = vld1q_u16(above - 1);
+ const uint16x8_t A01234567 = vld1q_u16(above);
+ const uint16x8_t A1234567_ = vld1q_u16(above + 1);
+ const uint16x8_t L01234567 = vld1q_u16(left);
+ // Reverse the left column so it runs bottom-to-top.
+ const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
+ const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
+ const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
+ const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
+ const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
+ // Rounded 3-tap smoothing: vrhadd(vhadd(outer, outer), center).
+ const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0);
+ const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_);
+ const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X);
+ const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567);
+ // Successive rows start one lane earlier in the diagonal (vext 7 down to 1,
+ // then row_0 itself for the bottom row).
+ const uint16x8_t r0 = vextq_u16(row_0, row_1, 7);
+ const uint16x8_t r1 = vextq_u16(row_0, row_1, 6);
+ const uint16x8_t r2 = vextq_u16(row_0, row_1, 5);
+ const uint16x8_t r3 = vextq_u16(row_0, row_1, 4);
+ const uint16x8_t r4 = vextq_u16(row_0, row_1, 3);
+ const uint16x8_t r5 = vextq_u16(row_0, row_1, 2);
+ const uint16x8_t r6 = vextq_u16(row_0, row_1, 1);
+ (void)bd;
+ vst1q_u16(dst, r0);
+ dst += stride;
+ vst1q_u16(dst, r1);
+ dst += stride;
+ vst1q_u16(dst, r2);
+ dst += stride;
+ vst1q_u16(dst, r3);
+ dst += stride;
+ vst1q_u16(dst, r4);
+ dst += stride;
+ vst1q_u16(dst, r5);
+ dst += stride;
+ vst1q_u16(dst, r6);
+ dst += stride;
+ vst1q_u16(dst, row_0);
+}
+
+// Store one 16-pixel output row as two 8-lane halves, then advance *dst to
+// the start of the next row (net advance of `stride` elements).
+static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride,
+ const uint16x8_t row_0,
+ const uint16x8_t row_1) {
+ vst1q_u16(*dst, row_0);
+ *dst += 8;
+ vst1q_u16(*dst, row_1);
+ *dst += stride - 8;
+}
+
+void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ // High bit-depth D135 intra predictor, 16x16 block. Builds a 32-element
+ // 3-tap smoothed diagonal [ row_0 | row_1 | row_2 | row_3 ] (reversed left
+ // column, above-left, then the above row) and emits each output row as a
+ // 16-lane window that slides one lane per row.
+ const uint16x8_t L01234567 = vld1q_u16(left);
+ const uint16x8_t L89abcdef = vld1q_u16(left + 8);
+ // Reverse the left column so it runs bottom-to-top.
+ const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
+ const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
+ const uint16x4_t Lba98 = vrev64_u16(vget_low_u16(L89abcdef));
+ const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef));
+ const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
+ const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98);
+ const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1);
+ const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2);
+ // row_0: smoothed left[15..8] (deepest part of the diagonal).
+ const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876);
+ const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987);
+
+ // row_1: smoothed left[7..0] feeding into above[-1].
+ const uint16x8_t XA0123456 = vld1q_u16(above - 1);
+ const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
+ const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
+ const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0);
+ const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X);
+
+ // row_2: smoothed above[0..7].
+ const uint16x8_t A01234567 = vld1q_u16(above);
+ const uint16x8_t A12345678 = vld1q_u16(above + 1);
+ const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678);
+ const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567);
+
+ // row_3: smoothed above[8..15].
+ const uint16x8_t A789abcde = vld1q_u16(above + 7);
+ const uint16x8_t A89abcdef = vld1q_u16(above + 8);
+ const uint16x8_t A9abcdef_ = vld1q_u16(above + 9);
+ const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_);
+ const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef);
+
+ // rN_0 / rN_1 are the two 8-lane halves of output row N's window.
+ const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7);
+ const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7);
+ const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6);
+ const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6);
+ const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5);
+ const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5);
+ const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4);
+ const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4);
+ const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3);
+ const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3);
+ const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2);
+ const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2);
+ const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1);
+ const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1);
+ // Rows 8-14 reuse rows 0-6's first halves as their second halves.
+ const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7);
+ const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6);
+ const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5);
+ const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4);
+ const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3);
+ const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2);
+ const uint16x8_t re_0 = vextq_u16(row_0, row_1, 1);
+ (void)bd;
+
+ d135_store_16(&dst, stride, r0_0, r0_1);
+ d135_store_16(&dst, stride, r1_0, r1_1);
+ d135_store_16(&dst, stride, r2_0, r2_1);
+ d135_store_16(&dst, stride, r3_0, r3_1);
+ d135_store_16(&dst, stride, r4_0, r4_1);
+ d135_store_16(&dst, stride, r5_0, r5_1);
+ d135_store_16(&dst, stride, r6_0, r6_1);
+ d135_store_16(&dst, stride, row_1, row_2);
+ d135_store_16(&dst, stride, r8_0, r0_0);
+ d135_store_16(&dst, stride, r9_0, r1_0);
+ d135_store_16(&dst, stride, ra_0, r2_0);
+ d135_store_16(&dst, stride, rb_0, r3_0);
+ d135_store_16(&dst, stride, rc_0, r4_0);
+ d135_store_16(&dst, stride, rd_0, r5_0);
+ d135_store_16(&dst, stride, re_0, r6_0);
+ // Bottom row: [ row_0 | row_1 ] with no shift.
+ vst1q_u16(dst, row_0);
+ dst += 8;
+ vst1q_u16(dst, row_1);
+}
+
+void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ // High bit-depth D135 intra predictor, 32x32 block. Builds a 64-element
+ // 3-tap smoothed diagonal in row_0..row_7 (reversed left column bottom-up,
+ // above-left, then the above row) and writes the block bottom-up with a
+ // rolling 5-vector window that shifts one lane per output row.
+ // LL = lower half of left, LU = upper half, AL/AR = left/right halves of
+ // above; names encode lane contents.
+ const uint16x8_t LL01234567 = vld1q_u16(left + 16);
+ const uint16x8_t LL89abcdef = vld1q_u16(left + 24);
+ const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567));
+ const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567));
+ const uint16x4_t LLba98 = vrev64_u16(vget_low_u16(LL89abcdef));
+ const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef));
+ const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210);
+ const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98);
+ const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1);
+ const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2);
+ // row_0: smoothed left[31..24] (deepest part of the diagonal).
+ const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876);
+ uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987);
+
+ const uint16x8_t LU01234567 = vld1q_u16(left);
+ const uint16x8_t LU89abcdef = vld1q_u16(left + 8);
+ const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567));
+ const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567));
+ const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef));
+ const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef));
+ const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210);
+ const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98);
+ const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1);
+ const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2);
+ // row_1: smoothed left[23..16].
+ const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe);
+ uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf);
+
+ // row_2: smoothed left[15..8].
+ const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1);
+ const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2);
+ const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876);
+ uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987);
+
+ // row_3: smoothed left[7..0] feeding into above[-1].
+ const uint16x8_t XAL0123456 = vld1q_u16(above - 1);
+ const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1);
+ const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2);
+ const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0);
+ uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X);
+
+ // row_4..row_7: smoothed above[0..31] in 8-lane chunks.
+ const uint16x8_t AL01234567 = vld1q_u16(above);
+ const uint16x8_t AL12345678 = vld1q_u16(above + 1);
+ const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678);
+ uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567);
+
+ const uint16x8_t AL789abcde = vld1q_u16(above + 7);
+ const uint16x8_t AL89abcdef = vld1q_u16(above + 8);
+ const uint16x8_t AL9abcdefg = vld1q_u16(above + 9);
+ const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg);
+ uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef);
+
+ const uint16x8_t ALfR0123456 = vld1q_u16(above + 15);
+ const uint16x8_t AR01234567 = vld1q_u16(above + 16);
+ const uint16x8_t AR12345678 = vld1q_u16(above + 17);
+ const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678);
+ uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567);
+
+ const uint16x8_t AR789abcde = vld1q_u16(above + 23);
+ const uint16x8_t AR89abcdef = vld1q_u16(above + 24);
+ const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25);
+ const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_);
+ uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef);
+ int i, j;
+ (void)bd;
+
+ // Write rows bottom-up: start at row 31, then step back one row per
+ // iteration (the four stores advance dst by 24, then dst -= stride + 24).
+ dst += 31 * stride;
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 8; ++j) {
+ vst1q_u16(dst, row_0);
+ dst += 8;
+ vst1q_u16(dst, row_1);
+ dst += 8;
+ vst1q_u16(dst, row_2);
+ dst += 8;
+ vst1q_u16(dst, row_3);
+ dst -= stride + 24;
+ // Slide the 5-vector window one lane toward the end of the diagonal.
+ row_0 = vextq_u16(row_0, row_1, 1);
+ row_1 = vextq_u16(row_1, row_2, 1);
+ row_2 = vextq_u16(row_2, row_3, 1);
+ row_3 = vextq_u16(row_3, row_4, 1);
+ row_4 = vextq_u16(row_4, row_4, 1);
+ }
+ // After 8 rows row_4 is exhausted; feed in the next diagonal chunk.
+ row_4 = row_5;
+ row_5 = row_6;
+ row_6 = row_7;
+ }
+}
+
+//------------------------------------------------------------------------------
+
+void vpx_highbd_d207_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ // High bit-depth D207 intra predictor, 4x4 block. Uses only the left
+ // column (above is unused): c0 holds rounded 2-tap averages of adjacent
+ // left pixels, c1 holds rounded 3-tap averages; interleaving them gives the
+ // output sequence, and each row starts two lanes further along it.
+ uint16x4_t l0, l1, l2, l3, c0, c1, c01_lo, c01_hi;
+ (void)above;
+ (void)bd;
+
+ l0 = vld1_u16(left + 0);
+ l3 = vld1_dup_u16(left + 3);
+
+ // [ left[1], left[2], left[3], left[3] ]
+ l1 = vext_u16(l0, l3, 1);
+ // [ left[2], left[3], left[3], left[3] ]
+ l2 = vext_u16(l0, l3, 2);
+
+ c0 = vrhadd_u16(l0, l1);
+ c1 = vrhadd_u16(vhadd_u16(l0, l2), l1);
+
+ c01_lo = vzip_u16(c0, c1).val[0];
+ c01_hi = vzip_u16(c0, c1).val[1];
+
+ // stride=0 [ c0[0], c1[0], c0[1], c1[1] ]
+ // stride=1 [ c0[1], c1[1], c0[2], c1[2] ]
+ // stride=2 [ c0[2], c1[2], c0[3], c1[3] ]
+ // stride=3 [ c0[3], c1[3], left[3], left[3] ]
+ vst1_u16(dst + 0 * stride, c01_lo);
+ vst1_u16(dst + 1 * stride, vext_u16(c01_lo, c01_hi, 2));
+ vst1_u16(dst + 2 * stride, c01_hi);
+ vst1_u16(dst + 3 * stride, vext_u16(c01_hi, l3, 2));
+}
+
+void vpx_highbd_d207_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ // High bit-depth D207 intra predictor, 8x8 block. Left-column only (above
+ // unused): interleaves 2-tap (c0) and 3-tap (c1) smoothed left pixels into
+ // c01_lo/c01_hi, then each row is a 2-lane shift; rows past the end of the
+ // filtered data are padded with a broadcast of left[7].
+ uint16x8_t l0, l1, l2, l7, c0, c1, c01_lo, c01_hi;
+ (void)above;
+ (void)bd;
+
+ l0 = vld1q_u16(left + 0);
+ l7 = vld1q_dup_u16(left + 7);
+
+ // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ]
+ l1 = vextq_u16(l0, l7, 1);
+ // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ]
+ l2 = vextq_u16(l0, l7, 2);
+
+ c0 = vrhaddq_u16(l0, l1);
+ c1 = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
+
+ c01_lo = vzipq_u16(c0, c1).val[0];
+ c01_hi = vzipq_u16(c0, c1).val[1];
+
+ // Row r starts 2*r lanes into [ c01_lo | c01_hi | l7 ].
+ vst1q_u16(dst + 0 * stride, c01_lo);
+ vst1q_u16(dst + 1 * stride, vextq_u16(c01_lo, c01_hi, 2));
+ vst1q_u16(dst + 2 * stride, vextq_u16(c01_lo, c01_hi, 4));
+ vst1q_u16(dst + 3 * stride, vextq_u16(c01_lo, c01_hi, 6));
+ vst1q_u16(dst + 4 * stride, c01_hi);
+ vst1q_u16(dst + 5 * stride, vextq_u16(c01_hi, l7, 2));
+ vst1q_u16(dst + 6 * stride, vextq_u16(c01_hi, l7, 4));
+ vst1q_u16(dst + 7 * stride, vextq_u16(c01_hi, l7, 6));
+}
+
+void vpx_highbd_d207_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ // High bit-depth D207 intra predictor, 16x16 block. Left-column only
+ // (above unused): interleaves 2-tap (c0) and 3-tap (c1) smoothed left
+ // pixels into c01[0..3]; each row is a window into [ c01 | l15 ] starting
+ // 2*r lanes in, padded with a broadcast of left[15] past the end.
+ uint16x8_t l0, l1, l2, l8, l9, l10, l15, c0[2], c1[2], c01[4];
+ (void)above;
+ (void)bd;
+
+ l0 = vld1q_u16(left + 0);
+ l1 = vld1q_u16(left + 1);
+ l2 = vld1q_u16(left + 2);
+ l8 = vld1q_u16(left + 8);
+ l15 = vld1q_dup_u16(left + 15);
+
+ // Shift l8 into the left[15] broadcast so lanes past left[15] repeat it
+ // without reading beyond the left array.
+ l9 = vextq_u16(l8, l15, 1);
+ l10 = vextq_u16(l8, l15, 2);
+
+ c0[0] = vrhaddq_u16(l0, l1);
+ c0[1] = vrhaddq_u16(l8, l9);
+ c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
+ c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9);
+
+ c01[0] = vzipq_u16(c0[0], c1[0]).val[0];
+ c01[1] = vzipq_u16(c0[0], c1[0]).val[1];
+ c01[2] = vzipq_u16(c0[1], c1[1]).val[0];
+ c01[3] = vzipq_u16(c0[1], c1[1]).val[1];
+
+ vst1q_u16(dst + 0 * stride + 0, c01[0]);
+ vst1q_u16(dst + 0 * stride + 8, c01[1]);
+ vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2));
+ vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2));
+ vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4));
+ vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4));
+ vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6));
+ vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6));
+
+ vst1q_u16(dst + 4 * stride + 0, c01[1]);
+ vst1q_u16(dst + 4 * stride + 8, c01[2]);
+ vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2));
+ vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2));
+ vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4));
+ vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4));
+ vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6));
+ vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6));
+
+ vst1q_u16(dst + 8 * stride + 0, c01[2]);
+ vst1q_u16(dst + 8 * stride + 8, c01[3]);
+ vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2));
+ vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], l15, 2));
+ vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4));
+ vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], l15, 4));
+ vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6));
+ vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], l15, 6));
+
+ // Final rows: the window runs off the filtered data, so the right halves
+ // are the left[15] broadcast.
+ vst1q_u16(dst + 12 * stride + 0, c01[3]);
+ vst1q_u16(dst + 12 * stride + 8, l15);
+ vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], l15, 2));
+ vst1q_u16(dst + 13 * stride + 8, l15);
+ vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], l15, 4));
+ vst1q_u16(dst + 14 * stride + 8, l15);
+ vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], l15, 6));
+ vst1q_u16(dst + 15 * stride + 8, l15);
+}
+
+void vpx_highbd_d207_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ uint16x8_t l0, l1, l2, l8, l9, l10, l16, l17, l18, l24, l25, l26, l31, c0[4],
+ c1[4], c01[8];
+ (void)above;
+ (void)bd;
+
+ l0 = vld1q_u16(left + 0);
+ l1 = vld1q_u16(left + 1);
+ l2 = vld1q_u16(left + 2);
+ l8 = vld1q_u16(left + 8);
+ l9 = vld1q_u16(left + 9);
+ l10 = vld1q_u16(left + 10);
+ l16 = vld1q_u16(left + 16);
+ l17 = vld1q_u16(left + 17);
+ l18 = vld1q_u16(left + 18);
+ l24 = vld1q_u16(left + 24);
+ l31 = vld1q_dup_u16(left + 31);
+
+ l25 = vextq_u16(l24, l31, 1);
+ l26 = vextq_u16(l24, l31, 2);
+
+ c0[0] = vrhaddq_u16(l0, l1);
+ c0[1] = vrhaddq_u16(l8, l9);
+ c0[2] = vrhaddq_u16(l16, l17);
+ c0[3] = vrhaddq_u16(l24, l25);
+ c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
+ c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9);
+ c1[2] = vrhaddq_u16(vhaddq_u16(l16, l18), l17);
+ c1[3] = vrhaddq_u16(vhaddq_u16(l24, l26), l25);
+
+ c01[0] = vzipq_u16(c0[0], c1[0]).val[0];
+ c01[1] = vzipq_u16(c0[0], c1[0]).val[1];
+ c01[2] = vzipq_u16(c0[1], c1[1]).val[0];
+ c01[3] = vzipq_u16(c0[1], c1[1]).val[1];
+ c01[4] = vzipq_u16(c0[2], c1[2]).val[0];
+ c01[5] = vzipq_u16(c0[2], c1[2]).val[1];
+ c01[6] = vzipq_u16(c0[3], c1[3]).val[0];
+ c01[7] = vzipq_u16(c0[3], c1[3]).val[1];
+
+ vst1q_u16(dst + 0 * stride + 0, c01[0]);
+ vst1q_u16(dst + 0 * stride + 8, c01[1]);
+ vst1q_u16(dst + 0 * stride + 16, c01[2]);
+ vst1q_u16(dst + 0 * stride + 24, c01[3]);
+ vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2));
+ vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2));
+ vst1q_u16(dst + 1 * stride + 16, vextq_u16(c01[2], c01[3], 2));
+ vst1q_u16(dst + 1 * stride + 24, vextq_u16(c01[3], c01[4], 2));
+ vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4));
+ vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4));
+ vst1q_u16(dst + 2 * stride + 16, vextq_u16(c01[2], c01[3], 4));
+ vst1q_u16(dst + 2 * stride + 24, vextq_u16(c01[3], c01[4], 4));
+ vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6));
+ vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6));
+ vst1q_u16(dst + 3 * stride + 16, vextq_u16(c01[2], c01[3], 6));
+ vst1q_u16(dst + 3 * stride + 24, vextq_u16(c01[3], c01[4], 6));
+
+ vst1q_u16(dst + 4 * stride + 0, c01[1]);
+ vst1q_u16(dst + 4 * stride + 8, c01[2]);
+ vst1q_u16(dst + 4 * stride + 16, c01[3]);
+ vst1q_u16(dst + 4 * stride + 24, c01[4]);
+ vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2));
+ vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2));
+ vst1q_u16(dst + 5 * stride + 16, vextq_u16(c01[3], c01[4], 2));
+ vst1q_u16(dst + 5 * stride + 24, vextq_u16(c01[4], c01[5], 2));
+ vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4));
+ vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4));
+ vst1q_u16(dst + 6 * stride + 16, vextq_u16(c01[3], c01[4], 4));
+ vst1q_u16(dst + 6 * stride + 24, vextq_u16(c01[4], c01[5], 4));
+ vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6));
+ vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6));
+ vst1q_u16(dst + 7 * stride + 16, vextq_u16(c01[3], c01[4], 6));
+ vst1q_u16(dst + 7 * stride + 24, vextq_u16(c01[4], c01[5], 6));
+
+ vst1q_u16(dst + 8 * stride + 0, c01[2]);
+ vst1q_u16(dst + 8 * stride + 8, c01[3]);
+ vst1q_u16(dst + 8 * stride + 16, c01[4]);
+ vst1q_u16(dst + 8 * stride + 24, c01[5]);
+ vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2));
+ vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], c01[4], 2));
+ vst1q_u16(dst + 9 * stride + 16, vextq_u16(c01[4], c01[5], 2));
+ vst1q_u16(dst + 9 * stride + 24, vextq_u16(c01[5], c01[6], 2));
+ vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4));
+ vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], c01[4], 4));
+ vst1q_u16(dst + 10 * stride + 16, vextq_u16(c01[4], c01[5], 4));
+ vst1q_u16(dst + 10 * stride + 24, vextq_u16(c01[5], c01[6], 4));
+ vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6));
+ vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], c01[4], 6));
+ vst1q_u16(dst + 11 * stride + 16, vextq_u16(c01[4], c01[5], 6));
+ vst1q_u16(dst + 11 * stride + 24, vextq_u16(c01[5], c01[6], 6));
+
+ vst1q_u16(dst + 12 * stride + 0, c01[3]);
+ vst1q_u16(dst + 12 * stride + 8, c01[4]);
+ vst1q_u16(dst + 12 * stride + 16, c01[5]);
+ vst1q_u16(dst + 12 * stride + 24, c01[6]);
+ vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], c01[4], 2));
+ vst1q_u16(dst + 13 * stride + 8, vextq_u16(c01[4], c01[5], 2));
+ vst1q_u16(dst + 13 * stride + 16, vextq_u16(c01[5], c01[6], 2));
+ vst1q_u16(dst + 13 * stride + 24, vextq_u16(c01[6], c01[7], 2));
+ vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], c01[4], 4));
+ vst1q_u16(dst + 14 * stride + 8, vextq_u16(c01[4], c01[5], 4));
+ vst1q_u16(dst + 14 * stride + 16, vextq_u16(c01[5], c01[6], 4));
+ vst1q_u16(dst + 14 * stride + 24, vextq_u16(c01[6], c01[7], 4));
+ vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], c01[4], 6));
+ vst1q_u16(dst + 15 * stride + 8, vextq_u16(c01[4], c01[5], 6));
+ vst1q_u16(dst + 15 * stride + 16, vextq_u16(c01[5], c01[6], 6));
+ vst1q_u16(dst + 15 * stride + 24, vextq_u16(c01[6], c01[7], 6));
+
+ vst1q_u16(dst + 16 * stride + 0, c01[4]);
+ vst1q_u16(dst + 16 * stride + 8, c01[5]);
+ vst1q_u16(dst + 16 * stride + 16, c01[6]);
+ vst1q_u16(dst + 16 * stride + 24, c01[7]);
+ vst1q_u16(dst + 17 * stride + 0, vextq_u16(c01[4], c01[5], 2));
+ vst1q_u16(dst + 17 * stride + 8, vextq_u16(c01[5], c01[6], 2));
+ vst1q_u16(dst + 17 * stride + 16, vextq_u16(c01[6], c01[7], 2));
+ vst1q_u16(dst + 17 * stride + 24, vextq_u16(c01[7], l31, 2));
+ vst1q_u16(dst + 18 * stride + 0, vextq_u16(c01[4], c01[5], 4));
+ vst1q_u16(dst + 18 * stride + 8, vextq_u16(c01[5], c01[6], 4));
+ vst1q_u16(dst + 18 * stride + 16, vextq_u16(c01[6], c01[7], 4));
+ vst1q_u16(dst + 18 * stride + 24, vextq_u16(c01[7], l31, 4));
+ vst1q_u16(dst + 19 * stride + 0, vextq_u16(c01[4], c01[5], 6));
+ vst1q_u16(dst + 19 * stride + 8, vextq_u16(c01[5], c01[6], 6));
+ vst1q_u16(dst + 19 * stride + 16, vextq_u16(c01[6], c01[7], 6));
+ vst1q_u16(dst + 19 * stride + 24, vextq_u16(c01[7], l31, 6));
+
+ vst1q_u16(dst + 20 * stride + 0, c01[5]);
+ vst1q_u16(dst + 20 * stride + 8, c01[6]);
+ vst1q_u16(dst + 20 * stride + 16, c01[7]);
+ vst1q_u16(dst + 20 * stride + 24, l31);
+ vst1q_u16(dst + 21 * stride + 0, vextq_u16(c01[5], c01[6], 2));
+ vst1q_u16(dst + 21 * stride + 8, vextq_u16(c01[6], c01[7], 2));
+ vst1q_u16(dst + 21 * stride + 16, vextq_u16(c01[7], l31, 2));
+ vst1q_u16(dst + 21 * stride + 24, vextq_u16(l31, l31, 2));
+ vst1q_u16(dst + 22 * stride + 0, vextq_u16(c01[5], c01[6], 4));
+ vst1q_u16(dst + 22 * stride + 8, vextq_u16(c01[6], c01[7], 4));
+ vst1q_u16(dst + 22 * stride + 16, vextq_u16(c01[7], l31, 4));
+ vst1q_u16(dst + 22 * stride + 24, vextq_u16(l31, l31, 4));
+ vst1q_u16(dst + 23 * stride + 0, vextq_u16(c01[5], c01[6], 6));
+ vst1q_u16(dst + 23 * stride + 8, vextq_u16(c01[6], c01[7], 6));
+ vst1q_u16(dst + 23 * stride + 16, vextq_u16(c01[7], l31, 6));
+ vst1q_u16(dst + 23 * stride + 24, vextq_u16(l31, l31, 6));
+
+ vst1q_u16(dst + 24 * stride + 0, c01[6]);
+ vst1q_u16(dst + 24 * stride + 8, c01[7]);
+ vst1q_u16(dst + 24 * stride + 16, l31);
+ vst1q_u16(dst + 24 * stride + 24, l31);
+ vst1q_u16(dst + 25 * stride + 0, vextq_u16(c01[6], c01[7], 2));
+ vst1q_u16(dst + 25 * stride + 8, vextq_u16(c01[7], l31, 2));
+ vst1q_u16(dst + 25 * stride + 16, vextq_u16(l31, l31, 2));
+ vst1q_u16(dst + 25 * stride + 24, vextq_u16(l31, l31, 2));
+ vst1q_u16(dst + 26 * stride + 0, vextq_u16(c01[6], c01[7], 4));
+ vst1q_u16(dst + 26 * stride + 8, vextq_u16(c01[7], l31, 4));
+ vst1q_u16(dst + 26 * stride + 16, vextq_u16(l31, l31, 4));
+ vst1q_u16(dst + 26 * stride + 24, vextq_u16(l31, l31, 4));
+ vst1q_u16(dst + 27 * stride + 0, vextq_u16(c01[6], c01[7], 6));
+ vst1q_u16(dst + 27 * stride + 8, vextq_u16(c01[7], l31, 6));
+ vst1q_u16(dst + 27 * stride + 16, vextq_u16(l31, l31, 6));
+ vst1q_u16(dst + 27 * stride + 24, vextq_u16(l31, l31, 6));
+
+ vst1q_u16(dst + 28 * stride + 0, c01[7]);
+ vst1q_u16(dst + 28 * stride + 8, l31);
+ vst1q_u16(dst + 28 * stride + 16, l31);
+ vst1q_u16(dst + 28 * stride + 24, l31);
+ vst1q_u16(dst + 29 * stride + 0, vextq_u16(c01[7], l31, 2));
+ vst1q_u16(dst + 29 * stride + 8, vextq_u16(l31, l31, 2));
+ vst1q_u16(dst + 29 * stride + 16, vextq_u16(l31, l31, 2));
+ vst1q_u16(dst + 29 * stride + 24, vextq_u16(l31, l31, 2));
+ vst1q_u16(dst + 30 * stride + 0, vextq_u16(c01[7], l31, 4));
+ vst1q_u16(dst + 30 * stride + 8, vextq_u16(l31, l31, 4));
+ vst1q_u16(dst + 30 * stride + 16, vextq_u16(l31, l31, 4));
+ vst1q_u16(dst + 30 * stride + 24, vextq_u16(l31, l31, 4));
+ vst1q_u16(dst + 31 * stride + 0, vextq_u16(c01[7], l31, 6));
+ vst1q_u16(dst + 31 * stride + 8, vextq_u16(l31, l31, 6));
+ vst1q_u16(dst + 31 * stride + 16, vextq_u16(l31, l31, 6));
+ vst1q_u16(dst + 31 * stride + 24, vextq_u16(l31, l31, 6));
+}
+
+//------------------------------------------------------------------------------
+
+// Vertical (V) prediction, 4x4: replicate the row of 4 pixels directly above
+// the block into all 4 output rows. |left| and |bd| are unused.
+void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const uint16x4_t row = vld1_u16(above);
+  int i;
+  (void)left;
+  (void)bd;
+
+  for (i = 0; i < 4; i++, dst += stride) {
+    vst1_u16(dst, row);
+  }
+}
+
+// Vertical (V) prediction, 8x8: replicate the 8 above pixels into all 8 rows.
+void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const uint16x8_t row = vld1q_u16(above);
+  int i;
+  (void)left;
+  (void)bd;
+
+  for (i = 0; i < 8; i++, dst += stride) {
+    vst1q_u16(dst, row);
+  }
+}
+
+// Vertical (V) prediction, 16x16: the 16 above pixels are held in two 8-lane
+// vectors and stored into each of the 16 rows.
+void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const uint16x8_t row0 = vld1q_u16(above + 0);
+  const uint16x8_t row1 = vld1q_u16(above + 8);
+  int i;
+  (void)left;
+  (void)bd;
+
+  for (i = 0; i < 16; i++) {
+    vst1q_u16(dst + 0, row0);
+    vst1q_u16(dst + 8, row1);
+    dst += stride;
+  }
+}
+
+// Vertical (V) prediction, 32x32: the 32 above pixels are held in four 8-lane
+// vectors and stored into each of the 32 rows.
+void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const uint16x8_t row0 = vld1q_u16(above + 0);
+  const uint16x8_t row1 = vld1q_u16(above + 8);
+  const uint16x8_t row2 = vld1q_u16(above + 16);
+  const uint16x8_t row3 = vld1q_u16(above + 24);
+  int i;
+  (void)left;
+  (void)bd;
+
+  for (i = 0; i < 32; i++) {
+    vst1q_u16(dst + 0, row0);
+    vst1q_u16(dst + 8, row1);
+    vst1q_u16(dst + 16, row2);
+    vst1q_u16(dst + 24, row3);
+    dst += stride;
+  }
+}
+
+// -----------------------------------------------------------------------------
+
+// Horizontal (H) prediction, 4x4: fill row r with left[r] broadcast across
+// the row. |above| and |bd| are unused.
+void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const uint16x4_t left_u16 = vld1_u16(left);
+  uint16x4_t row;
+  (void)above;
+  (void)bd;
+
+  row = vdup_lane_u16(left_u16, 0);
+  vst1_u16(dst, row);
+  dst += stride;
+  row = vdup_lane_u16(left_u16, 1);
+  vst1_u16(dst, row);
+  dst += stride;
+  row = vdup_lane_u16(left_u16, 2);
+  vst1_u16(dst, row);
+  dst += stride;
+  row = vdup_lane_u16(left_u16, 3);
+  vst1_u16(dst, row);
+}
+
+// Horizontal (H) prediction, 8x8: broadcast each of the 8 left-column pixels
+// across its row. vdupq_lane_u16 only addresses 4 lanes, so the 8 left
+// pixels are split into low/high halves.
+void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const uint16x8_t left_u16 = vld1q_u16(left);
+  const uint16x4_t left_low = vget_low_u16(left_u16);
+  const uint16x4_t left_high = vget_high_u16(left_u16);
+  uint16x8_t row;
+  (void)above;
+  (void)bd;
+
+  row = vdupq_lane_u16(left_low, 0);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_low, 1);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_low, 2);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_low, 3);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_high, 0);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_high, 1);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_high, 2);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_high, 3);
+  vst1q_u16(dst, row);
+}
+
+// Store |row| twice (16 pixels) to the current row of *dst, then advance
+// *dst to the start of the next row.
+static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride,
+                              const uint16x8_t row) {
+  // Note: vst1q is faster than vst2q
+  vst1q_u16(*dst, row);
+  *dst += 8;
+  vst1q_u16(*dst, row);
+  *dst += stride - 8;  // rewind the 8 pixels already written, go to next row
+}
+
+// Horizontal (H) prediction, 16x16: two passes of 8 left pixels, each
+// broadcast across a full 16-pixel row via h_store_16().
+void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  int i;
+  (void)above;
+  (void)bd;
+
+  for (i = 0; i < 2; i++, left += 8) {
+    const uint16x8_t left_u16q = vld1q_u16(left);
+    const uint16x4_t left_low = vget_low_u16(left_u16q);
+    const uint16x4_t left_high = vget_high_u16(left_u16q);
+    uint16x8_t row;
+
+    row = vdupq_lane_u16(left_low, 0);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_low, 1);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_low, 2);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_low, 3);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 0);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 1);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 2);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 3);
+    h_store_16(&dst, stride, row);
+  }
+}
+
+// Store |row| four times (32 pixels) to the current row of *dst, then
+// advance *dst to the start of the next row.
+static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride,
+                              const uint16x8_t row) {
+  // Note: vst1q is faster than vst2q
+  vst1q_u16(*dst, row);
+  *dst += 8;
+  vst1q_u16(*dst, row);
+  *dst += 8;
+  vst1q_u16(*dst, row);
+  *dst += 8;
+  vst1q_u16(*dst, row);
+  *dst += stride - 24;  // rewind the 24 pixels already advanced, next row
+}
+
+// Horizontal (H) prediction, 32x32: four passes of 8 left pixels, each
+// broadcast across a full 32-pixel row via h_store_32().
+void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  int i;
+  (void)above;
+  (void)bd;
+
+  for (i = 0; i < 4; i++, left += 8) {
+    const uint16x8_t left_u16q = vld1q_u16(left);
+    const uint16x4_t left_low = vget_low_u16(left_u16q);
+    const uint16x4_t left_high = vget_high_u16(left_u16q);
+    uint16x8_t row;
+
+    row = vdupq_lane_u16(left_low, 0);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_low, 1);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_low, 2);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_low, 3);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 0);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 1);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 2);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 3);
+    h_store_32(&dst, stride, row);
+  }
+}
+
+// -----------------------------------------------------------------------------
+
+// True-motion (TM) prediction, 4x4:
+//   pred(r, c) = clip(left[r] + above[c] - above[-1], 0, (1 << bd) - 1)
+// Two 4-pixel rows are packed into each 8-lane vector. The upper clip is
+// vminq_s16 against |max|; the lower clip to 0 happens in vqshluq_n_s16,
+// which saturates negative values to 0 on the signed->unsigned conversion.
+void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+  const int16x4_t above_s16d = vld1_s16((const int16_t *)above);
+  const int16x8_t above_s16 = vcombine_s16(above_s16d, above_s16d);
+  const int16x4_t left_s16 = vld1_s16((const int16_t *)left);
+  const int16x8_t sub = vsubq_s16(above_s16, top_left);  // above[c] - top_left
+  int16x8_t sum;
+  uint16x8_t row;
+
+  // Rows 0 and 1 in one vector.
+  sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1));
+  sum = vaddq_s16(sum, sub);
+  sum = vminq_s16(sum, max);
+  row = vqshluq_n_s16(sum, 0);
+  vst1_u16(dst, vget_low_u16(row));
+  dst += stride;
+  vst1_u16(dst, vget_high_u16(row));
+  dst += stride;
+
+  // Rows 2 and 3.
+  sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3));
+  sum = vaddq_s16(sum, sub);
+  sum = vminq_s16(sum, max);
+  row = vqshluq_n_s16(sum, 0);
+  vst1_u16(dst, vget_low_u16(row));
+  dst += stride;
+  vst1_u16(dst, vget_high_u16(row));
+}
+
+// Compute and store one 8-pixel TM-predicted row:
+// clip(left + (above - top_left), 0, max); vqshluq_n_s16 clamps negatives
+// to 0 while converting to unsigned. Advances *dst to the next row.
+static INLINE void tm_8_kernel(uint16_t **dst, const ptrdiff_t stride,
+                               const int16x8_t left_dup, const int16x8_t sub,
+                               const int16x8_t max) {
+  uint16x8_t row;
+  int16x8_t sum = vaddq_s16(left_dup, sub);
+  sum = vminq_s16(sum, max);
+  row = vqshluq_n_s16(sum, 0);
+  vst1q_u16(*dst, row);
+  *dst += stride;
+}
+
+// True-motion (TM) prediction, 8x8. |sub| = above[c] - top_left is computed
+// once; each row adds its broadcast left pixel and clips in tm_8_kernel().
+void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+  const int16x8_t above_s16 = vld1q_s16((const int16_t *)above);
+  const int16x8_t left_s16 = vld1q_s16((const int16_t *)left);
+  const int16x8_t sub = vsubq_s16(above_s16, top_left);
+  int16x4_t left_s16d;
+  int16x8_t left_dup;
+  int i;
+
+  left_s16d = vget_low_s16(left_s16);
+
+  // Iteration 0 uses left[0..3]; iteration 1 switches to left[4..7].
+  for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16)) {
+    left_dup = vdupq_lane_s16(left_s16d, 0);
+    tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+    left_dup = vdupq_lane_s16(left_s16d, 1);
+    tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+    left_dup = vdupq_lane_s16(left_s16d, 2);
+    tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+    left_dup = vdupq_lane_s16(left_s16d, 3);
+    tm_8_kernel(&dst, stride, left_dup, sub, max);
+  }
+}
+
+// Compute and store one 16-pixel TM-predicted row (two 8-lane halves, using
+// precomputed sub0/sub1 = above - top_left). Advances *dst to the next row.
+static INLINE void tm_16_kernel(uint16_t **dst, const ptrdiff_t stride,
+                                const int16x8_t left_dup, const int16x8_t sub0,
+                                const int16x8_t sub1, const int16x8_t max) {
+  uint16x8_t row0, row1;
+  int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+  int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+  sum0 = vminq_s16(sum0, max);
+  sum1 = vminq_s16(sum1, max);
+  row0 = vqshluq_n_s16(sum0, 0);  // saturates negatives to 0
+  row1 = vqshluq_n_s16(sum1, 0);
+  vst1q_u16(*dst, row0);
+  *dst += 8;
+  vst1q_u16(*dst, row1);
+  *dst += stride - 8;
+}
+
+// True-motion (TM) prediction, 16x16. sub0/sub1 (above - top_left) are
+// computed once; the left column is consumed 8 pixels per outer iteration,
+// 4 per inner iteration (low then high half of the loaded vector).
+void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+  const int16x8_t above0 = vld1q_s16((const int16_t *)above);
+  const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
+  const int16x8_t sub0 = vsubq_s16(above0, top_left);
+  const int16x8_t sub1 = vsubq_s16(above1, top_left);
+  int16x8_t left_dup;
+  int i, j;
+
+  for (j = 0; j < 2; j++, left += 8) {
+    const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
+    int16x4_t left_s16d = vget_low_s16(left_s16q);
+    for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
+      left_dup = vdupq_lane_s16(left_s16d, 0);
+      tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 1);
+      tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 2);
+      tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 3);
+      tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+    }
+  }
+}
+
+// Compute and store one 32-pixel TM-predicted row (four 8-lane quarters,
+// using precomputed sub0..sub3 = above - top_left). Advances *dst one row.
+static INLINE void tm_32_kernel(uint16_t **dst, const ptrdiff_t stride,
+                                const int16x8_t left_dup, const int16x8_t sub0,
+                                const int16x8_t sub1, const int16x8_t sub2,
+                                const int16x8_t sub3, const int16x8_t max) {
+  uint16x8_t row0, row1, row2, row3;
+  int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+  int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+  int16x8_t sum2 = vaddq_s16(left_dup, sub2);
+  int16x8_t sum3 = vaddq_s16(left_dup, sub3);
+  sum0 = vminq_s16(sum0, max);
+  sum1 = vminq_s16(sum1, max);
+  sum2 = vminq_s16(sum2, max);
+  sum3 = vminq_s16(sum3, max);
+  row0 = vqshluq_n_s16(sum0, 0);  // saturates negatives to 0
+  row1 = vqshluq_n_s16(sum1, 0);
+  row2 = vqshluq_n_s16(sum2, 0);
+  row3 = vqshluq_n_s16(sum3, 0);
+  vst1q_u16(*dst, row0);
+  *dst += 8;
+  vst1q_u16(*dst, row1);
+  *dst += 8;
+  vst1q_u16(*dst, row2);
+  *dst += 8;
+  vst1q_u16(*dst, row3);
+  *dst += stride - 24;
+}
+
+// True-motion (TM) prediction, 32x32. sub0..sub3 (above - top_left) are
+// computed once; the 32-entry left column is consumed 8 pixels per outer
+// iteration, 4 per inner iteration.
+void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+  const int16x8_t above0 = vld1q_s16((const int16_t *)above);
+  const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
+  const int16x8_t above2 = vld1q_s16((const int16_t *)(above + 16));
+  const int16x8_t above3 = vld1q_s16((const int16_t *)(above + 24));
+  const int16x8_t sub0 = vsubq_s16(above0, top_left);
+  const int16x8_t sub1 = vsubq_s16(above1, top_left);
+  const int16x8_t sub2 = vsubq_s16(above2, top_left);
+  const int16x8_t sub3 = vsubq_s16(above3, top_left);
+  int16x8_t left_dup;
+  int i, j;
+
+  for (i = 0; i < 4; i++, left += 8) {
+    const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
+    int16x4_t left_s16d = vget_low_s16(left_s16q);
+    for (j = 0; j < 2; j++, left_s16d = vget_high_s16(left_s16q)) {
+      left_dup = vdupq_lane_s16(left_s16d, 0);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 1);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 2);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 3);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c
new file mode 100644
index 0000000000..8d6e8acc4c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c
@@ -0,0 +1,776 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+// Broadcast the three 8-bit loop-filter thresholds into 16-bit vectors and
+// left-shift them by (bd - 8) so they match the scale of bd-bit pixels.
+static INLINE void load_thresh(const uint8_t *blimit, const uint8_t *limit,
+                               const uint8_t *thresh, uint16x8_t *blimit_vec,
+                               uint16x8_t *limit_vec, uint16x8_t *thresh_vec,
+                               const int bd) {
+  const int16x8_t shift = vdupq_n_s16(bd - 8);
+  *blimit_vec = vmovl_u8(vld1_dup_u8(blimit));
+  *limit_vec = vmovl_u8(vld1_dup_u8(limit));
+  *thresh_vec = vmovl_u8(vld1_dup_u8(thresh));
+  *blimit_vec = vshlq_u16(*blimit_vec, shift);
+  *limit_vec = vshlq_u16(*limit_vec, shift);
+  *thresh_vec = vshlq_u16(*thresh_vec, shift);
+}
+
+// Here flat is 128-bit long, with each 16-bit chunk being a mask of
+// a pixel. When used to control filter branches, we only detect whether it is
+// all 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status.
+// flat equals 0 if and only if flat_status equals 0.
+// flat equals -1 (all 1s) if and only if flat_status equals -4. (This is true
+// because each mask occupies more than 1 bit.)
+static INLINE uint32_t calc_flat_status(const uint16x8_t flat) {
+  // Fold the two 64-bit halves together, then pairwise-add the two 32-bit
+  // halves of the result, leaving a single 32-bit status word in lane 0.
+  const uint64x1_t t0 = vadd_u64(vreinterpret_u64_u16(vget_low_u16(flat)),
+                                 vreinterpret_u64_u16(vget_high_u16(flat)));
+  const uint64x1_t t1 = vpaddl_u32(vreinterpret_u32_u64(t0));
+  return vget_lane_u32(vreinterpret_u32_u64(t1), 0);
+}
+
+// Compute the per-pixel high-edge-variance mask (*hev) and the 4-tap filter
+// mask (*mask):
+//   hev  = max(|p1-p0|, |q1-q0|) > thresh
+//   mask = (all neighboring diffs <= limit) && (|p0-q0|*2 + |p1-q1|/2 <= blimit)
+// Returns max(|p1-p0|, |q1-q0|) so the caller can reuse it for flat masks.
+static INLINE uint16x8_t
+filter_hev_mask4(const uint16x8_t limit, const uint16x8_t blimit,
+                 const uint16x8_t thresh, const uint16x8_t p3,
+                 const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0,
+                 const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2,
+                 const uint16x8_t q3, uint16x8_t *hev, uint16x8_t *mask) {
+  uint16x8_t max, t0, t1;
+
+  max = vabdq_u16(p1, p0);
+  max = vmaxq_u16(max, vabdq_u16(q1, q0));
+  *hev = vcgtq_u16(max, thresh);
+  *mask = vmaxq_u16(max, vabdq_u16(p3, p2));
+  *mask = vmaxq_u16(*mask, vabdq_u16(p2, p1));
+  *mask = vmaxq_u16(*mask, vabdq_u16(q2, q1));
+  *mask = vmaxq_u16(*mask, vabdq_u16(q3, q2));
+  t0 = vabdq_u16(p0, q0);
+  t1 = vabdq_u16(p1, q1);
+  t0 = vaddq_u16(t0, t0);   // |p0 - q0| * 2
+  t1 = vshrq_n_u16(t1, 1);  // |p1 - q1| / 2
+  t0 = vaddq_u16(t0, t1);
+  *mask = vcleq_u16(*mask, limit);
+  t0 = vcleq_u16(t0, blimit);
+  *mask = vandq_u16(*mask, t0);
+
+  return max;
+}
+
+// As filter_hev_mask4(), plus the flat_mask4 computation: *flat is set where
+// all of |p3..p1 - p0| and |q3..q1 - q0| are <= 1 (scaled to bd), ANDed with
+// the filter mask. *flat_status summarizes *flat (see calc_flat_status).
+// Returns the filter mask.
+static INLINE uint16x8_t filter_flat_hev_mask(
+    const uint16x8_t limit, const uint16x8_t blimit, const uint16x8_t thresh,
+    const uint16x8_t p3, const uint16x8_t p2, const uint16x8_t p1,
+    const uint16x8_t p0, const uint16x8_t q0, const uint16x8_t q1,
+    const uint16x8_t q2, const uint16x8_t q3, uint16x8_t *flat,
+    uint32_t *flat_status, uint16x8_t *hev, const int bd) {
+  uint16x8_t mask;
+  const uint16x8_t max = filter_hev_mask4(limit, blimit, thresh, p3, p2, p1, p0,
+                                          q0, q1, q2, q3, hev, &mask);
+  *flat = vmaxq_u16(max, vabdq_u16(p2, p0));
+  *flat = vmaxq_u16(*flat, vabdq_u16(q2, q0));
+  *flat = vmaxq_u16(*flat, vabdq_u16(p3, p0));
+  *flat = vmaxq_u16(*flat, vabdq_u16(q3, q0));
+  *flat = vcleq_u16(*flat, vdupq_n_u16(1 << (bd - 8))); /* flat_mask4() */
+  *flat = vandq_u16(*flat, mask);
+  *flat_status = calc_flat_status(*flat);
+
+  return mask;
+}
+
+// Wider flatness test for the 16-wide filter: flat2 is set where all of
+// |p4..p1 - p0| and |q4..q1 - q0| are <= 1 (scaled to bd), ANDed with the
+// narrower |flat| mask. *flat2_status summarizes the result.
+static INLINE uint16x8_t flat_mask5(const uint16x8_t p4, const uint16x8_t p3,
+                                    const uint16x8_t p2, const uint16x8_t p1,
+                                    const uint16x8_t p0, const uint16x8_t q0,
+                                    const uint16x8_t q1, const uint16x8_t q2,
+                                    const uint16x8_t q3, const uint16x8_t q4,
+                                    const uint16x8_t flat,
+                                    uint32_t *flat2_status, const int bd) {
+  uint16x8_t flat2 = vabdq_u16(p4, p0);
+  flat2 = vmaxq_u16(flat2, vabdq_u16(p3, p0));
+  flat2 = vmaxq_u16(flat2, vabdq_u16(p2, p0));
+  flat2 = vmaxq_u16(flat2, vabdq_u16(p1, p0));
+  flat2 = vmaxq_u16(flat2, vabdq_u16(q1, q0));
+  flat2 = vmaxq_u16(flat2, vabdq_u16(q2, q0));
+  flat2 = vmaxq_u16(flat2, vabdq_u16(q3, q0));
+  flat2 = vmaxq_u16(flat2, vabdq_u16(q4, q0));
+  flat2 = vcleq_u16(flat2, vdupq_n_u16(1 << (bd - 8)));
+  flat2 = vandq_u16(flat2, flat);
+  *flat2_status = calc_flat_status(flat2);
+
+  return flat2;
+}
+
+// Convert bd-bit unsigned pixels to signed by subtracting half the range
+// (the bd-bit analogue of XORing 8-bit pixels with 0x80).
+static INLINE int16x8_t flip_sign(const uint16x8_t v, const int bd) {
+  const uint16x8_t offset = vdupq_n_u16(0x80 << (bd - 8));
+  return vreinterpretq_s16_u16(vsubq_u16(v, offset));
+}
+
+// Inverse of flip_sign(): convert signed values back to bd-bit unsigned
+// pixels by adding half the range.
+static INLINE uint16x8_t flip_sign_back(const int16x8_t v, const int bd) {
+  const int16x8_t offset = vdupq_n_s16(0x80 << (bd - 8));
+  return vreinterpretq_u16_s16(vaddq_s16(v, offset));
+}
+
+// Slide the moving-average window: drop sub0/sub1 from the running *sum and
+// add add0/add1.
+static INLINE void filter_update(const uint16x8_t sub0, const uint16x8_t sub1,
+                                 const uint16x8_t add0, const uint16x8_t add1,
+                                 uint16x8_t *sum) {
+  *sum = vsubq_u16(*sum, sub0);
+  *sum = vsubq_u16(*sum, sub1);
+  *sum = vaddq_u16(*sum, add0);
+  *sum = vaddq_u16(*sum, add1);
+}
+
+// Advance the 7-tap running sum and return the rounded filter output
+// (sum + 4) >> 3.
+static INLINE uint16x8_t calc_7_tap_filter_kernel(const uint16x8_t sub0,
+                                                  const uint16x8_t sub1,
+                                                  const uint16x8_t add0,
+                                                  const uint16x8_t add1,
+                                                  uint16x8_t *sum) {
+  filter_update(sub0, sub1, add0, add1, sum);
+  return vrshrq_n_u16(*sum, 3);
+}
+
+// Advance the 15-tap running sum and, per lane, select the rounded filter
+// output ((sum + 8) >> 4) where |flat| is set, else the unfiltered |in|.
+static INLINE uint16x8_t apply_15_tap_filter_kernel(
+    const uint16x8_t flat, const uint16x8_t sub0, const uint16x8_t sub1,
+    const uint16x8_t add0, const uint16x8_t add1, const uint16x8_t in,
+    uint16x8_t *sum) {
+  filter_update(sub0, sub1, add0, add1, sum);
+  return vbslq_u16(flat, vrshrq_n_u16(*sum, 4), in);
+}
+
+// 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+static INLINE void calc_7_tap_filter(const uint16x8_t p3, const uint16x8_t p2,
+ const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1,
+ const uint16x8_t q2, const uint16x8_t q3,
+ uint16x8_t *op2, uint16x8_t *op1,
+ uint16x8_t *op0, uint16x8_t *oq0,
+ uint16x8_t *oq1, uint16x8_t *oq2) {
+ uint16x8_t sum;
+ sum = vaddq_u16(p3, p3); // 2*p3
+ sum = vaddq_u16(sum, p3); // 3*p3
+ sum = vaddq_u16(sum, p2); // 3*p3+p2
+ sum = vaddq_u16(sum, p2); // 3*p3+2*p2
+ sum = vaddq_u16(sum, p1); // 3*p3+2*p2+p1
+ sum = vaddq_u16(sum, p0); // 3*p3+2*p2+p1+p0
+ sum = vaddq_u16(sum, q0); // 3*p3+2*p2+p1+p0+q0
+ *op2 = vrshrq_n_u16(sum, 3);
+ *op1 = calc_7_tap_filter_kernel(p3, p2, p1, q1, &sum);
+ *op0 = calc_7_tap_filter_kernel(p3, p1, p0, q2, &sum);
+ *oq0 = calc_7_tap_filter_kernel(p3, p0, q0, q3, &sum);
+ *oq1 = calc_7_tap_filter_kernel(p2, q0, q1, q3, &sum);
+ *oq2 = calc_7_tap_filter_kernel(p1, q1, q2, q3, &sum);
+}
+
+// Run the 7-tap filter and, per lane, blend its outputs over the existing
+// op1..oq1 (or the raw p2/q2 for the outermost taps) where |flat| is set.
+static INLINE void apply_7_tap_filter(const uint16x8_t flat,
+                                      const uint16x8_t p3, const uint16x8_t p2,
+                                      const uint16x8_t p1, const uint16x8_t p0,
+                                      const uint16x8_t q0, const uint16x8_t q1,
+                                      const uint16x8_t q2, const uint16x8_t q3,
+                                      uint16x8_t *op2, uint16x8_t *op1,
+                                      uint16x8_t *op0, uint16x8_t *oq0,
+                                      uint16x8_t *oq1, uint16x8_t *oq2) {
+  uint16x8_t tp1, tp0, tq0, tq1;
+  calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0, &tq0, &tq1,
+                    oq2);
+  *op2 = vbslq_u16(flat, *op2, p2);
+  *op1 = vbslq_u16(flat, tp1, *op1);
+  *op0 = vbslq_u16(flat, tp0, *op0);
+  *oq0 = vbslq_u16(flat, tq0, *oq0);
+  *oq1 = vbslq_u16(flat, tq1, *oq1);
+  *oq2 = vbslq_u16(flat, *oq2, q2);
+}
+
+// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+static INLINE void apply_15_tap_filter(
+ const uint16x8_t flat2, const uint16x8_t p7, const uint16x8_t p6,
+ const uint16x8_t p5, const uint16x8_t p4, const uint16x8_t p3,
+ const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2,
+ const uint16x8_t q3, const uint16x8_t q4, const uint16x8_t q5,
+ const uint16x8_t q6, const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5,
+ uint16x8_t *op4, uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1,
+ uint16x8_t *op0, uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2,
+ uint16x8_t *oq3, uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6) {
+ uint16x8_t sum;
+ sum = vshlq_n_u16(p7, 3); // 8*p7
+ sum = vsubq_u16(sum, p7); // 7*p7
+ sum = vaddq_u16(sum, p6); // 7*p7+p6
+ sum = vaddq_u16(sum, p6); // 7*p7+2*p6
+ sum = vaddq_u16(sum, p5); // 7*p7+2*p6+p5
+ sum = vaddq_u16(sum, p4); // 7*p7+2*p6+p5+p4
+ sum = vaddq_u16(sum, p3); // 7*p7+2*p6+p5+p4+p3
+ sum = vaddq_u16(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2
+ sum = vaddq_u16(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1
+ sum = vaddq_u16(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
+ sum = vaddq_u16(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
+ *op6 = vbslq_u16(flat2, vrshrq_n_u16(sum, 4), p6);
+ *op5 = apply_15_tap_filter_kernel(flat2, p7, p6, p5, q1, p5, &sum);
+ *op4 = apply_15_tap_filter_kernel(flat2, p7, p5, p4, q2, p4, &sum);
+ *op3 = apply_15_tap_filter_kernel(flat2, p7, p4, p3, q3, p3, &sum);
+ *op2 = apply_15_tap_filter_kernel(flat2, p7, p3, p2, q4, *op2, &sum);
+ *op1 = apply_15_tap_filter_kernel(flat2, p7, p2, p1, q5, *op1, &sum);
+ *op0 = apply_15_tap_filter_kernel(flat2, p7, p1, p0, q6, *op0, &sum);
+ *oq0 = apply_15_tap_filter_kernel(flat2, p7, p0, q0, q7, *oq0, &sum);
+ *oq1 = apply_15_tap_filter_kernel(flat2, p6, q0, q1, q7, *oq1, &sum);
+ *oq2 = apply_15_tap_filter_kernel(flat2, p5, q1, q2, q7, *oq2, &sum);
+ *oq3 = apply_15_tap_filter_kernel(flat2, p4, q2, q3, q7, q3, &sum);
+ *oq4 = apply_15_tap_filter_kernel(flat2, p3, q3, q4, q7, q4, &sum);
+ *oq5 = apply_15_tap_filter_kernel(flat2, p2, q4, q5, q7, q5, &sum);
+ *oq6 = apply_15_tap_filter_kernel(flat2, p1, q5, q6, q7, q6, &sum);
+}
+
+// The narrow (non-flat) 4-pixel loop filter: adjusts p1, p0, q0, q1 in
+// signed, zero-centered space, gated by |mask| (filter on/off) and |hev|
+// (outer taps on/off). Intermediate values are clamped to the bd-bit signed
+// range [min, max] to match the C reference's saturation behavior.
+static INLINE void filter4(const uint16x8_t mask, const uint16x8_t hev,
+                           const uint16x8_t p1, const uint16x8_t p0,
+                           const uint16x8_t q0, const uint16x8_t q1,
+                           uint16x8_t *op1, uint16x8_t *op0, uint16x8_t *oq0,
+                           uint16x8_t *oq1, const int bd) {
+  const int16x8_t max = vdupq_n_s16((1 << (bd - 1)) - 1);
+  // -(1 << (bd - 1)) built with an unsigned shift to avoid left-shifting a
+  // negative value (undefined behavior).
+  const int16x8_t min = vdupq_n_s16((int16_t)(((uint32_t)-1) << (bd - 1)));
+  int16x8_t filter, filter1, filter2, t;
+  int16x8_t ps1 = flip_sign(p1, bd);
+  int16x8_t ps0 = flip_sign(p0, bd);
+  int16x8_t qs0 = flip_sign(q0, bd);
+  int16x8_t qs1 = flip_sign(q1, bd);
+
+  /* add outer taps if we have high edge variance */
+  filter = vsubq_s16(ps1, qs1);
+  filter = vmaxq_s16(filter, min);
+  filter = vminq_s16(filter, max);
+  filter = vandq_s16(filter, vreinterpretq_s16_u16(hev));
+  t = vsubq_s16(qs0, ps0);
+
+  /* inner taps */
+  filter = vaddq_s16(filter, t);  // filter += 3 * (qs0 - ps0)
+  filter = vaddq_s16(filter, t);
+  filter = vaddq_s16(filter, t);
+  filter = vmaxq_s16(filter, min);
+  filter = vminq_s16(filter, max);
+  filter = vandq_s16(filter, vreinterpretq_s16_u16(mask));
+
+  /* save bottom 3 bits so that we round one side +4 and the other +3 */
+  /* if it equals 4 we'll set it to adjust by -1 to account for the fact */
+  /* we'd round it by 3 the other way */
+  t = vaddq_s16(filter, vdupq_n_s16(4));
+  t = vminq_s16(t, max);
+  filter1 = vshrq_n_s16(t, 3);
+  t = vaddq_s16(filter, vdupq_n_s16(3));
+  t = vminq_s16(t, max);
+  filter2 = vshrq_n_s16(t, 3);
+
+  qs0 = vsubq_s16(qs0, filter1);
+  qs0 = vmaxq_s16(qs0, min);
+  qs0 = vminq_s16(qs0, max);
+  ps0 = vaddq_s16(ps0, filter2);
+  ps0 = vmaxq_s16(ps0, min);
+  ps0 = vminq_s16(ps0, max);
+  *oq0 = flip_sign_back(qs0, bd);
+  *op0 = flip_sign_back(ps0, bd);
+
+  /* outer tap adjustments */
+  filter = vrshrq_n_s16(filter1, 1);  // (filter1 + 1) >> 1
+  filter = vbicq_s16(filter, vreinterpretq_s16_u16(hev));  // only if !hev
+
+  qs1 = vsubq_s16(qs1, filter);
+  qs1 = vmaxq_s16(qs1, min);
+  qs1 = vminq_s16(qs1, max);
+  ps1 = vaddq_s16(ps1, filter);
+  ps1 = vmaxq_s16(ps1, min);
+  ps1 = vminq_s16(ps1, max);
+  *oq1 = flip_sign_back(qs1, bd);
+  *op1 = flip_sign_back(ps1, bd);
+}
+
+// 8-wide loop filter dispatch. flat_status is -4 when every lane is flat and
+// 0 when none is (see calc_flat_status):
+//   all flat  -> 7-tap filter only (filter4 would be fully masked out);
+//   none flat -> filter4 only;
+//   mixed     -> filter4, then blend in 7-tap results per lane.
+static INLINE void filter8(const uint16x8_t mask, const uint16x8_t flat,
+                           const uint32_t flat_status, const uint16x8_t hev,
+                           const uint16x8_t p3, const uint16x8_t p2,
+                           const uint16x8_t p1, const uint16x8_t p0,
+                           const uint16x8_t q0, const uint16x8_t q1,
+                           const uint16x8_t q2, const uint16x8_t q3,
+                           uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0,
+                           uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2,
+                           const int bd) {
+  if (flat_status != (uint32_t)-4) {
+    filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd);
+    *op2 = p2;
+    *oq2 = q2;
+    if (flat_status) {
+      apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,
+                         oq0, oq1, oq2);
+    }
+  } else {
+    calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, oq0, oq1,
+                      oq2);
+  }
+}
+
+// 16-wide loop filter dispatch, layered like filter8():
+//   filter4 unless every lane is flat;
+//   7-tap blend if any lane is flat but not every lane is flat2;
+//   15-tap blend if any lane is flat2.
+static INLINE void filter16(
+    const uint16x8_t mask, const uint16x8_t flat, const uint32_t flat_status,
+    const uint16x8_t flat2, const uint32_t flat2_status, const uint16x8_t hev,
+    const uint16x8_t p7, const uint16x8_t p6, const uint16x8_t p5,
+    const uint16x8_t p4, const uint16x8_t p3, const uint16x8_t p2,
+    const uint16x8_t p1, const uint16x8_t p0, const uint16x8_t q0,
+    const uint16x8_t q1, const uint16x8_t q2, const uint16x8_t q3,
+    const uint16x8_t q4, const uint16x8_t q5, const uint16x8_t q6,
+    const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5, uint16x8_t *op4,
+    uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0,
+    uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, uint16x8_t *oq3,
+    uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6, const int bd) {
+  if (flat_status != (uint32_t)-4) {
+    filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd);
+  }
+
+  if (flat_status) {
+    *op2 = p2;
+    *oq2 = q2;
+    if (flat2_status != (uint32_t)-4) {
+      apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,
+                         oq0, oq1, oq2);
+    }
+    if (flat2_status) {
+      apply_15_tap_filter(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3,
+                          q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0,
+                          oq0, oq1, oq2, oq3, oq4, oq5, oq6);
+    }
+  }
+}
+
+// Load 8 consecutive rows of 8 pixels, naming them p3..p0, q0..q3 as the
+// horizontal loop filter expects (|s| points at the p3 row; |p| is the
+// row stride).
+static INLINE void load_8x8(const uint16_t *s, const int p, uint16x8_t *p3,
+                            uint16x8_t *p2, uint16x8_t *p1, uint16x8_t *p0,
+                            uint16x8_t *q0, uint16x8_t *q1, uint16x8_t *q2,
+                            uint16x8_t *q3) {
+  *p3 = vld1q_u16(s);
+  s += p;
+  *p2 = vld1q_u16(s);
+  s += p;
+  *p1 = vld1q_u16(s);
+  s += p;
+  *p0 = vld1q_u16(s);
+  s += p;
+  *q0 = vld1q_u16(s);
+  s += p;
+  *q1 = vld1q_u16(s);
+  s += p;
+  *q2 = vld1q_u16(s);
+  s += p;
+  *q3 = vld1q_u16(s);
+}
+
+// Load 16 consecutive rows of 8 pixels into s0..s15 (|p| is the row stride).
+static INLINE void load_8x16(const uint16_t *s, const int p, uint16x8_t *s0,
+                             uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3,
+                             uint16x8_t *s4, uint16x8_t *s5, uint16x8_t *s6,
+                             uint16x8_t *s7, uint16x8_t *s8, uint16x8_t *s9,
+                             uint16x8_t *s10, uint16x8_t *s11, uint16x8_t *s12,
+                             uint16x8_t *s13, uint16x8_t *s14,
+                             uint16x8_t *s15) {
+  *s0 = vld1q_u16(s);
+  s += p;
+  *s1 = vld1q_u16(s);
+  s += p;
+  *s2 = vld1q_u16(s);
+  s += p;
+  *s3 = vld1q_u16(s);
+  s += p;
+  *s4 = vld1q_u16(s);
+  s += p;
+  *s5 = vld1q_u16(s);
+  s += p;
+  *s6 = vld1q_u16(s);
+  s += p;
+  *s7 = vld1q_u16(s);
+  s += p;
+  *s8 = vld1q_u16(s);
+  s += p;
+  *s9 = vld1q_u16(s);
+  s += p;
+  *s10 = vld1q_u16(s);
+  s += p;
+  *s11 = vld1q_u16(s);
+  s += p;
+  *s12 = vld1q_u16(s);
+  s += p;
+  *s13 = vld1q_u16(s);
+  s += p;
+  *s14 = vld1q_u16(s);
+  s += p;
+  *s15 = vld1q_u16(s);
+}
+
+static INLINE void store_8x4(uint16_t *s, const int p, const uint16x8_t s0,
+ const uint16x8_t s1, const uint16x8_t s2,
+ const uint16x8_t s3) {
+ vst1q_u16(s, s0);
+ s += p;
+ vst1q_u16(s, s1);
+ s += p;
+ vst1q_u16(s, s2);
+ s += p;
+ vst1q_u16(s, s3);
+}
+
+static INLINE void store_8x6(uint16_t *s, const int p, const uint16x8_t s0,
+ const uint16x8_t s1, const uint16x8_t s2,
+ const uint16x8_t s3, const uint16x8_t s4,
+ const uint16x8_t s5) {
+ vst1q_u16(s, s0);
+ s += p;
+ vst1q_u16(s, s1);
+ s += p;
+ vst1q_u16(s, s2);
+ s += p;
+ vst1q_u16(s, s3);
+ s += p;
+ vst1q_u16(s, s4);
+ s += p;
+ vst1q_u16(s, s5);
+}
+
+static INLINE void store_4x8(uint16_t *s, const int p, const uint16x8_t p1,
+ const uint16x8_t p0, const uint16x8_t q0,
+ const uint16x8_t q1) {
+ uint16x8x4_t o;
+
+ o.val[0] = p1;
+ o.val[1] = p0;
+ o.val[2] = q0;
+ o.val[3] = q1;
+ vst4q_lane_u16(s, o, 0);
+ s += p;
+ vst4q_lane_u16(s, o, 1);
+ s += p;
+ vst4q_lane_u16(s, o, 2);
+ s += p;
+ vst4q_lane_u16(s, o, 3);
+ s += p;
+ vst4q_lane_u16(s, o, 4);
+ s += p;
+ vst4q_lane_u16(s, o, 5);
+ s += p;
+ vst4q_lane_u16(s, o, 6);
+ s += p;
+ vst4q_lane_u16(s, o, 7);
+}
+
+static INLINE void store_6x8(uint16_t *s, const int p, const uint16x8_t s0,
+ const uint16x8_t s1, const uint16x8_t s2,
+ const uint16x8_t s3, const uint16x8_t s4,
+ const uint16x8_t s5) {
+ uint16x8x3_t o0, o1;
+
+ o0.val[0] = s0;
+ o0.val[1] = s1;
+ o0.val[2] = s2;
+ o1.val[0] = s3;
+ o1.val[1] = s4;
+ o1.val[2] = s5;
+ vst3q_lane_u16(s - 3, o0, 0);
+ vst3q_lane_u16(s + 0, o1, 0);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 1);
+ vst3q_lane_u16(s + 0, o1, 1);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 2);
+ vst3q_lane_u16(s + 0, o1, 2);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 3);
+ vst3q_lane_u16(s + 0, o1, 3);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 4);
+ vst3q_lane_u16(s + 0, o1, 4);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 5);
+ vst3q_lane_u16(s + 0, o1, 5);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 6);
+ vst3q_lane_u16(s + 0, o1, 6);
+ s += p;
+ vst3q_lane_u16(s - 3, o0, 7);
+ vst3q_lane_u16(s + 0, o1, 7);
+}
+
+static INLINE void store_7x8(uint16_t *s, const int p, const uint16x8_t s0,
+ const uint16x8_t s1, const uint16x8_t s2,
+ const uint16x8_t s3, const uint16x8_t s4,
+ const uint16x8_t s5, const uint16x8_t s6) {
+ uint16x8x4_t o0;
+ uint16x8x3_t o1;
+
+ o0.val[0] = s0;
+ o0.val[1] = s1;
+ o0.val[2] = s2;
+ o0.val[3] = s3;
+ o1.val[0] = s4;
+ o1.val[1] = s5;
+ o1.val[2] = s6;
+ vst4q_lane_u16(s - 4, o0, 0);
+ vst3q_lane_u16(s + 0, o1, 0);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 1);
+ vst3q_lane_u16(s + 0, o1, 1);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 2);
+ vst3q_lane_u16(s + 0, o1, 2);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 3);
+ vst3q_lane_u16(s + 0, o1, 3);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 4);
+ vst3q_lane_u16(s + 0, o1, 4);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 5);
+ vst3q_lane_u16(s + 0, o1, 5);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 6);
+ vst3q_lane_u16(s + 0, o1, 6);
+ s += p;
+ vst4q_lane_u16(s - 4, o0, 7);
+ vst3q_lane_u16(s + 0, o1, 7);
+}
+
+static INLINE void store_8x14(uint16_t *s, const int p, const uint16x8_t p6,
+ const uint16x8_t p5, const uint16x8_t p4,
+ const uint16x8_t p3, const uint16x8_t p2,
+ const uint16x8_t p1, const uint16x8_t p0,
+ const uint16x8_t q0, const uint16x8_t q1,
+ const uint16x8_t q2, const uint16x8_t q3,
+ const uint16x8_t q4, const uint16x8_t q5,
+ const uint16x8_t q6, const uint32_t flat_status,
+ const uint32_t flat2_status) {
+ if (flat_status) {
+ if (flat2_status) {
+ vst1q_u16(s - 7 * p, p6);
+ vst1q_u16(s - 6 * p, p5);
+ vst1q_u16(s - 5 * p, p4);
+ vst1q_u16(s - 4 * p, p3);
+ vst1q_u16(s + 3 * p, q3);
+ vst1q_u16(s + 4 * p, q4);
+ vst1q_u16(s + 5 * p, q5);
+ vst1q_u16(s + 6 * p, q6);
+ }
+ vst1q_u16(s - 3 * p, p2);
+ vst1q_u16(s + 2 * p, q2);
+ }
+ vst1q_u16(s - 2 * p, p1);
+ vst1q_u16(s - 1 * p, p0);
+ vst1q_u16(s + 0 * p, q0);
+ vst1q_u16(s + 1 * p, q1);
+}
+
+void vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ mask, hev;
+
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
+ q2, q3, &hev, &mask);
+ filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd);
+ store_8x4(s - 2 * p, p, p1, p0, q0, q1);
+}
+
+void vpx_highbd_lpf_horizontal_4_dual_neon(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_horizontal_4_neon(s, p, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_horizontal_4_neon(s + 8, p, blimit1, limit1, thresh1, bd);
+}
+
+void vpx_highbd_lpf_vertical_4_neon(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ mask, hev;
+
+ load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1,
+ (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1,
+ (int16x8_t *)&q2, (int16x8_t *)&q3);
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
+ q2, q3, &hev, &mask);
+ filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd);
+ store_4x8(s - 2, p, p1, p0, q0, q1);
+}
+
+void vpx_highbd_lpf_vertical_4_dual_neon(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd);
+}
+
+void vpx_highbd_lpf_horizontal_8_neon(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+ uint32_t flat_status;
+
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
+ q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
+ filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+ &op1, &op0, &oq0, &oq1, &oq2, bd);
+ store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
+}
+
+void vpx_highbd_lpf_horizontal_8_dual_neon(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, bd);
+}
+
+void vpx_highbd_lpf_vertical_8_neon(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+ uint32_t flat_status;
+
+ load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1,
+ (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1,
+ (int16x8_t *)&q2, (int16x8_t *)&q3);
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
+ q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
+ filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+ &op1, &op0, &oq0, &oq1, &oq2, bd);
+ // Note: store_6x8() is faster than transpose + store_8x8().
+ store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2);
+}
+
+void vpx_highbd_lpf_vertical_8_dual_neon(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd);
+}
+
+// Quiet warnings of the form: 'vpx_dsp/arm/highbd_loopfilter_neon.c|675 col 67|
+// warning: 'oq1' may be used uninitialized in this function
+// [-Wmaybe-uninitialized]', for oq1-op1. Without reworking the code or adding
+// an additional branch this warning cannot be silenced otherwise. The
+// loopfilter is only called when needed for a block so these output pixels
+// will be set.
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
+static void lpf_horizontal_16_kernel(uint16_t *s, int p,
+ const uint16x8_t blimit_vec,
+ const uint16x8_t limit_vec,
+ const uint16x8_t thresh_vec,
+ const int bd) {
+ uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2,
+ q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+ oq4, oq5, oq6;
+ uint32_t flat_status, flat2_status;
+
+ load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2,
+ &q3, &q4, &q5, &q6, &q7);
+ mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
+ q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
+ flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat,
+ &flat2_status, bd);
+ filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4,
+ p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4,
+ &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
+ bd);
+ store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4,
+ oq5, oq6, flat_status, flat2_status);
+}
+
+static void lpf_vertical_16_kernel(uint16_t *s, int p,
+ const uint16x8_t blimit_vec,
+ const uint16x8_t limit_vec,
+ const uint16x8_t thresh_vec, const int bd) {
+ uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2,
+ q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+ oq4, oq5, oq6;
+ uint32_t flat_status, flat2_status;
+
+ load_8x8(s - 8, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0);
+ transpose_s16_8x8((int16x8_t *)&p7, (int16x8_t *)&p6, (int16x8_t *)&p5,
+ (int16x8_t *)&p4, (int16x8_t *)&p3, (int16x8_t *)&p2,
+ (int16x8_t *)&p1, (int16x8_t *)&p0);
+ load_8x8(s, p, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+ transpose_s16_8x8((int16x8_t *)&q0, (int16x8_t *)&q1, (int16x8_t *)&q2,
+ (int16x8_t *)&q3, (int16x8_t *)&q4, (int16x8_t *)&q5,
+ (int16x8_t *)&q6, (int16x8_t *)&q7);
+ mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
+ q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
+ flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat,
+ &flat2_status, bd);
+ filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4,
+ p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4,
+ &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
+ bd);
+ if (flat_status) {
+ if (flat2_status) {
+ store_7x8(s - 3, p, op6, op5, op4, op3, op2, op1, op0);
+ store_7x8(s + 4, p, oq0, oq1, oq2, oq3, oq4, oq5, oq6);
+ } else {
+ // Note: store_6x8() is faster than transpose + store_8x8().
+ store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2);
+ }
+ } else {
+ store_4x8(s - 2, p, op1, op0, oq0, oq1);
+ }
+}
+
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+
+void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
+}
+
+void vpx_highbd_lpf_horizontal_16_dual_neon(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
+ lpf_horizontal_16_kernel(s + 8, p, blimit_vec, limit_vec, thresh_vec, bd);
+}
+
+void vpx_highbd_lpf_vertical_16_neon(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
+}
+
+void vpx_highbd_lpf_vertical_16_dual_neon(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16x8_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
+ lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
+ lpf_vertical_16_kernel(s + 8 * p, p, blimit_vec, limit_vec, thresh_vec, bd);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c
new file mode 100644
index 0000000000..d2a7add60d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store(
+ const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1,
+ tran_low_t *dqcoeff_ptr) {
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_8_neon(
+ const int32x4_t coeff_0, const int32x4_t coeff_1, const int32x4_t zbin,
+ const int32x4_t round, const int32x4_t quant, const int32x4_t quant_shift,
+ int32x4_t *qcoeff_0, int32x4_t *qcoeff_1) {
+ // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
+ const int32x4_t coeff_0_sign = vshrq_n_s32(coeff_0, 31);
+ const int32x4_t coeff_1_sign = vshrq_n_s32(coeff_1, 31);
+ const int32x4_t coeff_0_abs = vabsq_s32(coeff_0);
+ const int32x4_t coeff_1_abs = vabsq_s32(coeff_1);
+
+ // Calculate 2 masks of elements outside the bin
+ const int32x4_t zbin_mask_0 =
+ vreinterpretq_s32_u32(vcgeq_s32(coeff_0_abs, zbin));
+ const int32x4_t zbin_mask_1 = vreinterpretq_s32_u32(
+ vcgeq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(zbin), 1)));
+
+ // Get the rounded values
+ const int32x4_t rounded_0 = vaddq_s32(coeff_0_abs, round);
+ const int32x4_t rounded_1 =
+ vaddq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(round), 1));
+
+ // (round * (quant << 15) * 2) >> 16 == (round * quant)
+ int32x4_t qcoeff_tmp_0 = vqdmulhq_s32(rounded_0, quant);
+ int32x4_t qcoeff_tmp_1 =
+ vqdmulhq_s32(rounded_1, vdupq_lane_s32(vget_low_s32(quant), 1));
+
+ // Add rounded values
+ qcoeff_tmp_0 = vaddq_s32(qcoeff_tmp_0, rounded_0);
+ qcoeff_tmp_1 = vaddq_s32(qcoeff_tmp_1, rounded_1);
+
+ // (round * (quant_shift << 15) * 2) >> 16 == (round * quant_shift)
+ qcoeff_tmp_0 = vqdmulhq_s32(qcoeff_tmp_0, quant_shift);
+ qcoeff_tmp_1 =
+ vqdmulhq_s32(qcoeff_tmp_1, vdupq_lane_s32(vget_low_s32(quant_shift), 1));
+
+ // Restore the sign bit.
+ qcoeff_tmp_0 = veorq_s32(qcoeff_tmp_0, coeff_0_sign);
+ qcoeff_tmp_1 = veorq_s32(qcoeff_tmp_1, coeff_1_sign);
+ qcoeff_tmp_0 = vsubq_s32(qcoeff_tmp_0, coeff_0_sign);
+ qcoeff_tmp_1 = vsubq_s32(qcoeff_tmp_1, coeff_1_sign);
+
+ // Only keep the relevant coeffs
+ *qcoeff_0 = vandq_s32(qcoeff_tmp_0, zbin_mask_0);
+ *qcoeff_1 = vandq_s32(qcoeff_tmp_1, zbin_mask_1);
+}
+
+static VPX_FORCE_INLINE int16x8_t
+highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int32x4_t zbin,
+ const int32x4_t round, const int32x4_t quant,
+ const int32x4_t quant_shift, const int32x4_t dequant) {
+ int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
+
+ // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
+ const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
+ const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
+ highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
+ &qcoeff_0, &qcoeff_1);
+
+ // Store the 32-bit qcoeffs
+ vst1q_s32(qcoeff_ptr, qcoeff_0);
+ vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
+
+ // Calculate and store the dqcoeffs
+ dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
+ dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
+
+ highbd_calculate_dqcoeff_and_store(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
+
+ return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
+}
+
+void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int16x8_t neg_one = vdupq_n_s16(-1);
+ uint16x8_t eob_max;
+
+ // Only the first element of each vector is DC.
+ // High half has identical elements, but we can reconstruct it from the low
+ // half by duplicating the 2nd element. So we only need to pass a 4x32-bit
+ // vector
+ int32x4_t zbin = vmovl_s16(vld1_s16(zbin_ptr));
+ int32x4_t round = vmovl_s16(vld1_s16(round_ptr));
+ // Extend the quant, quant_shift vectors to ones of 32-bit elements
+ // scale to high-half, so we can use vqdmulhq_s32
+ int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15);
+ int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 15);
+ int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));
+
+ // Process first 8 values which include a dc component.
+ {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+ __builtin_prefetch(coeff_ptr + 64);
+
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+
+ n_coeffs -= 8;
+
+ {
+ zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
+ round = vdupq_lane_s32(vget_low_s32(round), 1);
+ quant = vdupq_lane_s32(vget_low_s32(quant), 1);
+ quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
+ dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);
+
+ do {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+ round, quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max =
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ n_coeffs -= 8;
+ } while (n_coeffs > 0);
+ }
+
+#if VPX_ARCH_AARCH64
+ *eob_ptr = vmaxvq_u16(eob_max);
+#else
+ {
+ const uint16x4_t eob_max_0 =
+ vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+ const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+ const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+ vst1_lane_u16(eob_ptr, eob_max_2, 0);
+ }
+#endif // VPX_ARCH_AARCH64
+ // Need these here, else the compiler complains about mixing declarations and
+ // code in C90
+ (void)n_coeffs;
+ (void)scan;
+}
+
+static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) {
+ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
+}
+
+static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store_32x32(
+ int32x4_t dqcoeff_0, int32x4_t dqcoeff_1, tran_low_t *dqcoeff_ptr) {
+ // Add 1 if negative to round towards zero because the C uses division.
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+ dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+ dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
+ dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+}
+
+static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int32x4_t zbin, const int32x4_t round,
+ const int32x4_t quant, const int32x4_t quant_shift,
+ const int32x4_t dequant) {
+ int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
+
+ // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
+ const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
+ const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
+ highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
+ &qcoeff_0, &qcoeff_1);
+
+ // Store the 32-bit qcoeffs
+ vst1q_s32(qcoeff_ptr, qcoeff_0);
+ vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
+
+ // Calculate and store the dqcoeffs
+ dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
+ dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
+
+ highbd_calculate_dqcoeff_and_store_32x32(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
+
+ return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
+}
+
+void vpx_highbd_quantize_b_32x32_neon(
+ const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
+ const int16x8_t neg_one = vdupq_n_s16(-1);
+ uint16x8_t eob_max;
+ int i;
+ const int16_t *iscan = scan_order->iscan;
+
+ // Only the first element of each vector is DC.
+ // High half has identical elements, but we can reconstruct it from the low
+ // half by duplicating the 2nd element. So we only need to pass a 4x32-bit
+ // vector
+ int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->zbin)), 1);
+ int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->round)), 1);
+ // Extend the quant, quant_shift vectors to ones of 32-bit elements
+ // scale to high-half, so we can use vqdmulhq_s32
+ int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15);
+ int32x4_t quant_shift =
+ vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 16);
+ int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));
+
+ // Process first 8 values which include a dc component.
+ {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+ round, quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+
+ {
+ zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
+ round = vdupq_lane_s32(vget_low_s32(round), 1);
+ quant = vdupq_lane_s32(vget_low_s32(quant), 1);
+ quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
+ dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);
+
+ for (i = 1; i < 32 * 32 / 8; ++i) {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+ round, quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max =
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+ }
+
+#if VPX_ARCH_AARCH64
+ *eob_ptr = vmaxvq_u16(eob_max);
+#else
+ {
+ const uint16x4_t eob_max_0 =
+ vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+ const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+ const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+ vst1_lane_u16(eob_ptr, eob_max_2, 0);
+ }
+#endif // VPX_ARCH_AARCH64
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c
new file mode 100644
index 0000000000..a6684b0534
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
+ uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
+ uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
+ uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
+ uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride);
+
+ sum[0] = vabal_u16(sum[0], s, r0);
+ sum[1] = vabal_u16(sum[1], s, r1);
+ sum[2] = vabal_u16(sum[2], s, r2);
+ sum[3] = vabal_u16(sum[3], s, r3);
+
+ } while (++i < h);
+
+ vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32[4];
+
+ int i = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+
+ sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
+ sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
+ sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
+ sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride));
+
+ } while (++i < h);
+
+ sum_u32[0] = vpaddlq_u16(sum[0]);
+ sum_u32[1] = vpaddlq_u16(sum[1]);
+ sum_u32[2] = vpaddlq_u16(sum[2]);
+ sum_u32[3] = vpaddlq_u16(sum[3]);
+ vst1q_u32(res, horizontal_add_4d_uint32x4(sum_u32));
+}
+
+static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
+ uint32x4_t *const sad_sum) {
+ uint16x8_t abs_diff = vabdq_u16(src, ref);
+ *sad_sum = vpadalq_u16(*sad_sum, abs_diff);
+}
+
+static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int i = 0;
+ do {
+ uint16x8_t s0, s1;
+
+ s0 = vld1q_u16(src16_ptr + i * src_stride);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]);
+
+ s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]);
+
+ } while (++i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4], int w,
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0, s1, s2, s3;
+
+ s0 = vld1q_u16(src16_ptr + i * src_stride + j);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]);
+
+ s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]);
+
+ s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
+ sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
+ &sum_lo[0]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
+ &sum_lo[1]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
+ &sum_lo[2]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16),
+ &sum_lo[3]);
+
+ s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
+ sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
+ &sum_hi[0]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
+ &sum_hi[1]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
+ &sum_hi[2]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24),
+ &sum_hi[3]);
+
+ j += 32;
+ } while (j < w);
+
+ } while (++i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void highbd_sad64xhx4d_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, h);
+}
+
+static INLINE void highbd_sad32xhx4d_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, h);
+}
+
+#define HBD_SAD_WXH_4D_NEON(w, h) \
+ void vpx_highbd_sad##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx4d_neon(src, src_stride, ref_array, ref_stride, \
+ sad_array, (h)); \
+ }
+
+HBD_SAD_WXH_4D_NEON(4, 4)
+HBD_SAD_WXH_4D_NEON(4, 8)
+
+HBD_SAD_WXH_4D_NEON(8, 4)
+HBD_SAD_WXH_4D_NEON(8, 8)
+HBD_SAD_WXH_4D_NEON(8, 16)
+
+HBD_SAD_WXH_4D_NEON(16, 8)
+HBD_SAD_WXH_4D_NEON(16, 16)
+HBD_SAD_WXH_4D_NEON(16, 32)
+
+HBD_SAD_WXH_4D_NEON(32, 16)
+HBD_SAD_WXH_4D_NEON(32, 32)
+HBD_SAD_WXH_4D_NEON(32, 64)
+
+HBD_SAD_WXH_4D_NEON(64, 32)
+HBD_SAD_WXH_4D_NEON(64, 64)
+
+#undef HBD_SAD_WXH_4D_NEON
+
+#define HBD_SAD_SKIP_WXH_4D_NEON(w, h) \
+ void vpx_highbd_sad_skip_##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx4d_neon(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+ sad_array, ((h) >> 1)); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+HBD_SAD_SKIP_WXH_4D_NEON(4, 4)
+HBD_SAD_SKIP_WXH_4D_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_4D_NEON(8, 4)
+HBD_SAD_SKIP_WXH_4D_NEON(8, 8)
+HBD_SAD_SKIP_WXH_4D_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_4D_NEON(16, 8)
+HBD_SAD_SKIP_WXH_4D_NEON(16, 16)
+HBD_SAD_SKIP_WXH_4D_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_4D_NEON(32, 16)
+HBD_SAD_SKIP_WXH_4D_NEON(32, 32)
+HBD_SAD_SKIP_WXH_4D_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_4D_NEON(64, 32)
+HBD_SAD_SKIP_WXH_4D_NEON(64, 64)
+
+#undef HBD_SAD_SKIP_WXH_4D_NEON
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c
new file mode 100644
index 0000000000..b99bac66cd
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE uint32_t highbd_sad4xh_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr);
+ uint16x4_t r = vld1_u16(ref16_ptr);
+ sum = vabal_u16(sum, s, r);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad8xh_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr);
+ uint16x8_t r = vld1q_u16(ref16_ptr);
+ sum = vabaq_u16(sum, s, r);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint16x8(sum);
+}
+
+static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ uint16x8_t s0, s1, r0, r1;
+ uint16x8_t diff0, diff1;
+
+ s0 = vld1q_u16(src16_ptr);
+ r0 = vld1q_u16(ref16_ptr);
+ diff0 = vabdq_u16(s0, r0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ s1 = vld1q_u16(src16_ptr + 8);
+ r1 = vld1q_u16(ref16_ptr + 8);
+ diff1 = vabdq_u16(s1, r1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_uint32x4(sum[0]);
+}
+
+static INLINE uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3;
+ uint16x8_t diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u16(src16_ptr + j);
+ r0 = vld1q_u16(ref16_ptr + j);
+ diff0 = vabdq_u16(s0, r0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ s1 = vld1q_u16(src16_ptr + j + 8);
+ r1 = vld1q_u16(ref16_ptr + j + 8);
+ diff1 = vabdq_u16(s1, r1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ s2 = vld1q_u16(src16_ptr + j + 16);
+ r2 = vld1q_u16(ref16_ptr + j + 16);
+ diff2 = vabdq_u16(s2, r2);
+ sum[2] = vpadalq_u16(sum[2], diff2);
+
+ s3 = vld1q_u16(src16_ptr + j + 24);
+ r3 = vld1q_u16(ref16_ptr + j + 24);
+ diff3 = vabdq_u16(s3, r3);
+ sum[3] = vpadalq_u16(sum[3], diff3);
+
+ j += 32;
+ } while (j < w);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+
+ return horizontal_add_uint32x4(sum[0]);
+}
+
+static INLINE unsigned int highbd_sad64xh_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
+}
+
+static INLINE unsigned int highbd_sad32xh_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
+}
+
+#define HBD_SAD_WXH_NEON(w, h) \
+ unsigned int vpx_highbd_sad##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \
+ }
+
+HBD_SAD_WXH_NEON(4, 4)
+HBD_SAD_WXH_NEON(4, 8)
+
+HBD_SAD_WXH_NEON(8, 4)
+HBD_SAD_WXH_NEON(8, 8)
+HBD_SAD_WXH_NEON(8, 16)
+
+HBD_SAD_WXH_NEON(16, 8)
+HBD_SAD_WXH_NEON(16, 16)
+HBD_SAD_WXH_NEON(16, 32)
+
+HBD_SAD_WXH_NEON(32, 16)
+HBD_SAD_WXH_NEON(32, 32)
+HBD_SAD_WXH_NEON(32, 64)
+
+HBD_SAD_WXH_NEON(64, 32)
+HBD_SAD_WXH_NEON(64, 64)
+
+#undef HBD_SAD_WXH_NEON
+
+#define HBD_SAD_SKIP_WXH_NEON(w, h) \
+ unsigned int vpx_highbd_sad_skip_##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad##w##xh_neon(src, 2 * src_stride, ref, \
+ 2 * ref_stride, (h) / 2); \
+ }
+
+HBD_SAD_SKIP_WXH_NEON(4, 4)
+HBD_SAD_SKIP_WXH_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_NEON(8, 4)
+HBD_SAD_SKIP_WXH_NEON(8, 8)
+HBD_SAD_SKIP_WXH_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_NEON(16, 8)
+HBD_SAD_SKIP_WXH_NEON(16, 16)
+HBD_SAD_SKIP_WXH_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_NEON(32, 16)
+HBD_SAD_SKIP_WXH_NEON(32, 32)
+HBD_SAD_SKIP_WXH_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_NEON(64, 32)
+HBD_SAD_SKIP_WXH_NEON(64, 64)
+
+#undef HBD_SAD_SKIP_WXH_NEON
+
+static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr);
+ uint16x4_t r = vld1_u16(ref16_ptr);
+ uint16x4_t p = vld1_u16(pred16_ptr);
+
+ uint16x4_t avg = vrhadd_u16(r, p);
+ sum = vabal_u16(sum, s, avg);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 4;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr);
+ uint16x8_t r = vld1q_u16(ref16_ptr);
+ uint16x8_t p = vld1q_u16(pred16_ptr);
+
+ uint16x8_t avg = vrhaddq_u16(r, p);
+ uint16x8_t diff = vabdq_u16(s, avg);
+ sum = vpadalq_u16(sum, diff);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 8;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ uint16x8_t s0, s1, r0, r1, p0, p1;
+ uint16x8_t avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u16(src16_ptr);
+ r0 = vld1q_u16(ref16_ptr);
+ p0 = vld1q_u16(pred16_ptr);
+ avg0 = vrhaddq_u16(r0, p0);
+ diff0 = vabdq_u16(s0, avg0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ s1 = vld1q_u16(src16_ptr + 8);
+ r1 = vld1q_u16(ref16_ptr + 8);
+ p1 = vld1q_u16(pred16_ptr + 8);
+ avg1 = vrhaddq_u16(r1, p1);
+ diff1 = vabdq_u16(s1, avg1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 16;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_uint32x4(sum[0]);
+}
+
+static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+ uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u16(src16_ptr + j);
+ r0 = vld1q_u16(ref16_ptr + j);
+ p0 = vld1q_u16(pred16_ptr + j);
+ avg0 = vrhaddq_u16(r0, p0);
+ diff0 = vabdq_u16(s0, avg0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ s1 = vld1q_u16(src16_ptr + j + 8);
+ r1 = vld1q_u16(ref16_ptr + j + 8);
+ p1 = vld1q_u16(pred16_ptr + j + 8);
+ avg1 = vrhaddq_u16(r1, p1);
+ diff1 = vabdq_u16(s1, avg1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ s2 = vld1q_u16(src16_ptr + j + 16);
+ r2 = vld1q_u16(ref16_ptr + j + 16);
+ p2 = vld1q_u16(pred16_ptr + j + 16);
+ avg2 = vrhaddq_u16(r2, p2);
+ diff2 = vabdq_u16(s2, avg2);
+ sum[2] = vpadalq_u16(sum[2], diff2);
+
+ s3 = vld1q_u16(src16_ptr + j + 24);
+ r3 = vld1q_u16(ref16_ptr + j + 24);
+ p3 = vld1q_u16(pred16_ptr + j + 24);
+ avg3 = vrhaddq_u16(r3, p3);
+ diff3 = vabdq_u16(s3, avg3);
+ sum[3] = vpadalq_u16(sum[3], diff3);
+
+ j += 32;
+ } while (j < w);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += w;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+
+ return horizontal_add_uint32x4(sum[0]);
+}
+
+static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
+ second_pred);
+}
+
+static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+ second_pred);
+}
+
+#define HBD_SAD_WXH_AVG_NEON(w, h) \
+ uint32_t vpx_highbd_sad##w##x##h##_avg_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+ second_pred); \
+ }
+
+HBD_SAD_WXH_AVG_NEON(4, 4)
+HBD_SAD_WXH_AVG_NEON(4, 8)
+
+HBD_SAD_WXH_AVG_NEON(8, 4)
+HBD_SAD_WXH_AVG_NEON(8, 8)
+HBD_SAD_WXH_AVG_NEON(8, 16)
+
+HBD_SAD_WXH_AVG_NEON(16, 8)
+HBD_SAD_WXH_AVG_NEON(16, 16)
+HBD_SAD_WXH_AVG_NEON(16, 32)
+
+HBD_SAD_WXH_AVG_NEON(32, 16)
+HBD_SAD_WXH_AVG_NEON(32, 32)
+HBD_SAD_WXH_AVG_NEON(32, 64)
+
+HBD_SAD_WXH_AVG_NEON(64, 32)
+HBD_SAD_WXH_AVG_NEON(64, 64)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
new file mode 100644
index 0000000000..683df5797a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+// The bilinear filters look like this:
+//
+// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }}
+//
+// We can factor out the highest common multiple, such that the sum of both
+// weights will be 8 instead of 128. The benefits of this are two-fold:
+//
+// 1) We can infer the filter values from the filter_offset parameter in the
+// bilinear filter functions below - we don't have to actually load the values
+// from memory:
+// f0 = 8 - filter_offset
+// f1 = filter_offset
+//
+// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on
+// 16-bit data types at all times, rather than widening out to 32-bit and
+// requiring double the number of data processing instructions. (12-bit * 8 =
+// 15-bit.)
+
+// Process a block exactly 4 wide and any height.
+static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+ const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint16x4_t s0 = load_unaligned_u16(src_ptr);
+ uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step);
+
+ uint16x4_t blend = vmul_u16(s0, f0);
+ blend = vmla_u16(blend, s1, f1);
+ blend = vrshr_n_u16(blend, 3);
+
+ vst1_u16(dst_ptr, blend);
+
+ src_ptr += src_stride;
+ dst_ptr += 4;
+ } while (--i != 0);
+}
+
+// Process a block which is a multiple of 8 and any height.
+static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
+ uint16_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_width, int dst_height,
+ int filter_offset) {
+ const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+ const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+
+ uint16x8_t blend = vmulq_u16(s0, f0);
+ blend = vmlaq_u16(blend, s1, f1);
+ blend = vrshrq_n_u16(blend, 3);
+
+ vst1q_u16(dst_ptr + j, blend);
+
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 8, dst_height, filter_offset);
+}
+static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 16, dst_height, filter_offset);
+}
+static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 32, dst_height, filter_offset);
+}
+static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 64, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t avg = vrhaddq_u16(s0, s1);
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ \
+ return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+ w, ref, ref_stride, sse); \
+ }
+
+#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
+ h); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \
+ src_stride, h, yoffset); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
+ xoffset); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+
+// 10-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+
+// 12-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+
+// Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having
+// width 4.
+static void highbd_avg_pred_var_filter_block2d_bil_w4(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+ const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint16x4_t s0 = load_unaligned_u16(src_ptr);
+ uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step);
+ uint16x4_t p = vld1_u16(second_pred);
+
+ uint16x4_t blend = vmul_u16(s0, f0);
+ blend = vmla_u16(blend, s1, f1);
+ blend = vrshr_n_u16(blend, 3);
+
+ vst1_u16(dst_ptr, vrhadd_u16(blend, p));
+
+ src_ptr += src_stride;
+ dst_ptr += 4;
+ second_pred += 4;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks.
+static void highbd_avg_pred_var_filter_block2d_bil_large(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint16_t *second_pred) {
+ const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+ const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t blend = vmulq_u16(s0, f0);
+ blend = vmlaq_u16(blend, s1, f1);
+ blend = vrshrq_n_u16(blend, 3);
+
+ vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w8(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 8, dst_height,
+ filter_offset, second_pred);
+}
+static void highbd_avg_pred_var_filter_block2d_bil_w16(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 16, dst_height,
+ filter_offset, second_pred);
+}
+static void highbd_avg_pred_var_filter_block2d_bil_w32(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 32, dst_height,
+ filter_offset, second_pred);
+}
+static void highbd_avg_pred_var_filter_block2d_bil_w64(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 64, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with vpx_highbd_comp_avg_pred.
+static void highbd_avg_pred_var_filter_block2d_avg(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, const uint16_t *second_pred) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t avg = vrhaddq_u16(s0, s1);
+
+ uint16x8_t p = vld1q_u16(second_pred);
+ avg = vrhaddq_u16(avg, p);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Implementation of vpx_highbd_comp_avg_pred for blocks having width >= 16.
+static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+ int src_stride, int dst_width, int dst_height,
+ const uint16_t *second_pred) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t avg = vrhaddq_u16(s, p);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ }
+
+#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ if (xoffset == 0) { \
+ uint16_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ src_ptr, tmp, source_stride, source_stride, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else { \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ src_ptr, tmp, source_stride, source_stride, h, yoffset, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ src_ptr, tmp0, source_stride, 1, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
+ (h + 1)); \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
+ (h + 1)); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ src_ptr, tmp0, source_stride, 1, h, xoffset, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+
+// 10-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+
+// 12-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c
new file mode 100644
index 0000000000..75fde676a0
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c
@@ -0,0 +1,509 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+// Process a block of width 4 two rows at a time.
+// Accumulates the sum of (src - ref) differences and the sum of squared
+// differences for a 4-wide block of even height h. The 16-bit sum
+// accumulator is safe for the 4x4 and 4x8 block sizes instantiated below
+// (at most 4 differences of magnitude <= 4095 per lane).
+static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr,
+                                            int src_stride,
+                                            const uint16_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            uint64_t *sse, int64_t *sum) {
+  int16x8_t sum_s16 = vdupq_n_s16(0);
+  int32x4_t sse_s32 = vdupq_n_s32(0);
+
+  int i = h;
+  do {
+    // Load two 4-element rows into one 8-lane vector.
+    const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride);
+    const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride);
+
+    int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+    sum_s16 = vaddq_s16(sum_s16, diff);
+
+    sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff));
+    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff));
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  *sum = horizontal_add_int16x8(sum_s16);
+  *sse = horizontal_add_int32x4(sse_s32);
+}
+
+// For 8-bit and 10-bit data, since we're using two int32x4 accumulators, all
+// block sizes can be processed in 32-bit elements (1023*1023*64*16 = 1071645696
+// for a 64x64 block).
+// Requires w to be a multiple of 8 (the inner loop advances 8 lanes at a
+// time). Outputs the difference sum in *sum and the sum of squared
+// differences in *sse.
+static INLINE void highbd_variance_large_neon(const uint16_t *src_ptr,
+                                              int src_stride,
+                                              const uint16_t *ref_ptr,
+                                              int ref_stride, int w, int h,
+                                              uint64_t *sse, int64_t *sum) {
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+  int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      const uint16x8_t s = vld1q_u16(src_ptr + j);
+      const uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+      const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+      // Pairwise-widen the 16-bit differences into the 32-bit sum.
+      sum_s32 = vpadalq_s16(sum_s32, diff);
+
+      sse_s32[0] =
+          vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+      sse_s32[1] =
+          vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+      j += 8;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sum = horizontal_add_int32x4(sum_s32);
+  // The squared sums are non-negative, so reinterpret as unsigned and widen
+  // to 64 bits while reducing.
+  *sse = horizontal_long_add_uint32x4(vaddq_u32(
+      vreinterpretq_u32_s32(sse_s32[0]), vreinterpretq_u32_s32(sse_s32[1])));
+}
+
+// Fixed-width convenience wrappers around highbd_variance_large_neon.
+static INLINE void highbd_variance_8xh_neon(const uint16_t *src, int src_stride,
+                                            const uint16_t *ref, int ref_stride,
+                                            int h, uint64_t *sse,
+                                            int64_t *sum) {
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, 8, h, sse, sum);
+}
+
+static INLINE void highbd_variance_16xh_neon(const uint16_t *src,
+                                             int src_stride,
+                                             const uint16_t *ref,
+                                             int ref_stride, int h,
+                                             uint64_t *sse, int64_t *sum) {
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, 16, h, sse, sum);
+}
+
+static INLINE void highbd_variance_32xh_neon(const uint16_t *src,
+                                             int src_stride,
+                                             const uint16_t *ref,
+                                             int ref_stride, int h,
+                                             uint64_t *sse, int64_t *sum) {
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
+
+static INLINE void highbd_variance_64xh_neon(const uint16_t *src,
+                                             int src_stride,
+                                             const uint16_t *ref,
+                                             int ref_stride, int h,
+                                             uint64_t *sse, int64_t *sum) {
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
+
+// For 12-bit data, we can only accumulate up to 128 elements in the sum of
+// squares (4095*4095*128 = 2146435200), and because we're using two int32x4
+// accumulators, we can only process up to 32 32-element rows (32*32/8 = 128)
+// or 16 64-element rows before we have to accumulate into 64-bit elements.
+// Therefore blocks of size 32x64, 64x32 and 64x64 are processed in a different
+// helper function.
+
+// Process a block of any size where the width is divisible by 8, with
+// accumulation into 64-bit elements.
+static INLINE void highbd_variance_xlarge_neon(
+    const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr,
+    int ref_stride, int w, int h, int h_limit, uint64_t *sse, int64_t *sum) {
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+  int64x2_t sse_s64 = vdupq_n_s64(0);
+
+  // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit
+  // accumulator overflows. After hitting this limit we accumulate into 64-bit
+  // elements.
+  int h_tmp = h > h_limit ? h_limit : h;
+
+  int i = 0;
+  do {
+    // Fresh 32-bit partial sum-of-squares accumulators for each batch of rows.
+    int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+    do {
+      int j = 0;
+      do {
+        const uint16x8_t s0 = vld1q_u16(src_ptr + j);
+        const uint16x8_t r0 = vld1q_u16(ref_ptr + j);
+
+        const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+        sum_s32 = vpadalq_s16(sum_s32, diff);
+
+        sse_s32[0] =
+            vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+        sse_s32[1] =
+            vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+        j += 8;
+      } while (j < w);
+
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+      i++;
+    } while (i < h_tmp);
+
+    // Fold the 32-bit partial sums of squares into the 64-bit accumulator
+    // before starting the next batch of rows.
+    sse_s64 = vpadalq_s32(sse_s64, sse_s32[0]);
+    sse_s64 = vpadalq_s32(sse_s64, sse_s32[1]);
+    h_tmp += h_limit;
+  } while (i < h);
+
+  *sum = horizontal_add_int32x4(sum_s32);
+  *sse = (uint64_t)horizontal_add_int64x2(sse_s64);
+}
+
+// 12-bit wrappers: an h_limit of 32 rows (w == 32) or 16 rows (w == 64)
+// keeps each 32-bit partial sum-of-squares accumulator within bounds.
+static INLINE void highbd_variance_32xh_xlarge_neon(
+    const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+    int h, uint64_t *sse, int64_t *sum) {
+  highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 32, h, 32, sse,
+                              sum);
+}
+
+static INLINE void highbd_variance_64xh_xlarge_neon(
+    const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+    int h, uint64_t *sse, int64_t *sum) {
+  highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 64, h, 16, sse,
+                              sum);
+}
+
+// 8-bit variance: sse and sum need no normalization.
+#define HBD_VARIANCE_WXH_8_NEON(w, h)                                    \
+  uint32_t vpx_highbd_8_variance##w##x##h##_neon(                        \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,    \
+      int ref_stride, uint32_t *sse) {                                   \
+    int sum;                                                             \
+    uint64_t sse_long = 0;                                               \
+    int64_t sum_long = 0;                                                \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                        \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                        \
+    highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h,    \
+                                 &sse_long, &sum_long);                  \
+    *sse = (uint32_t)sse_long;                                           \
+    sum = (int)sum_long;                                                 \
+    return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h));            \
+  }
+
+// 10-bit variance: sse is rounded down by 4 bits and sum by 2 bits, and a
+// negative result is clamped to 0.
+#define HBD_VARIANCE_WXH_10_NEON(w, h)                                   \
+  uint32_t vpx_highbd_10_variance##w##x##h##_neon(                       \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,    \
+      int ref_stride, uint32_t *sse) {                                   \
+    int sum;                                                             \
+    int64_t var;                                                         \
+    uint64_t sse_long = 0;                                               \
+    int64_t sum_long = 0;                                                \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                        \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                        \
+    highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h,    \
+                                 &sse_long, &sum_long);                  \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);                    \
+    sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);                          \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h));            \
+    return (var >= 0) ? (uint32_t)var : 0;                               \
+  }
+
+// 12-bit variance: sse is rounded down by 8 bits and sum by 4 bits.
+#define HBD_VARIANCE_WXH_12_NEON(w, h)                                   \
+  uint32_t vpx_highbd_12_variance##w##x##h##_neon(                       \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,    \
+      int ref_stride, uint32_t *sse) {                                   \
+    int sum;                                                             \
+    int64_t var;                                                         \
+    uint64_t sse_long = 0;                                               \
+    int64_t sum_long = 0;                                                \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                        \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                        \
+    highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h,    \
+                                 &sse_long, &sum_long);                  \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);                    \
+    sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);                          \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h));            \
+    return (var >= 0) ? (uint32_t)var : 0;                               \
+  }
+
+// 12-bit variance for the block sizes that need 64-bit accumulation
+// (32x64, 64x32, 64x64); see highbd_variance_xlarge_neon.
+#define HBD_VARIANCE_WXH_12_XLARGE_NEON(w, h)                               \
+  uint32_t vpx_highbd_12_variance##w##x##h##_neon(                          \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, uint32_t *sse) {                                      \
+    int sum;                                                                \
+    int64_t var;                                                            \
+    uint64_t sse_long = 0;                                                  \
+    int64_t sum_long = 0;                                                   \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                           \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                           \
+    highbd_variance_##w##xh_xlarge_neon(src, src_stride, ref, ref_stride, h, \
+                                        &sse_long, &sum_long);              \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);                       \
+    sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);                             \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h));               \
+    return (var >= 0) ? (uint32_t)var : 0;                                  \
+  }
+
+// 8-bit bit-depth instantiations.
+HBD_VARIANCE_WXH_8_NEON(4, 4)
+HBD_VARIANCE_WXH_8_NEON(4, 8)
+
+HBD_VARIANCE_WXH_8_NEON(8, 4)
+HBD_VARIANCE_WXH_8_NEON(8, 8)
+HBD_VARIANCE_WXH_8_NEON(8, 16)
+
+HBD_VARIANCE_WXH_8_NEON(16, 8)
+HBD_VARIANCE_WXH_8_NEON(16, 16)
+HBD_VARIANCE_WXH_8_NEON(16, 32)
+
+HBD_VARIANCE_WXH_8_NEON(32, 16)
+HBD_VARIANCE_WXH_8_NEON(32, 32)
+HBD_VARIANCE_WXH_8_NEON(32, 64)
+
+HBD_VARIANCE_WXH_8_NEON(64, 32)
+HBD_VARIANCE_WXH_8_NEON(64, 64)
+
+// 10-bit bit-depth instantiations.
+HBD_VARIANCE_WXH_10_NEON(4, 4)
+HBD_VARIANCE_WXH_10_NEON(4, 8)
+
+HBD_VARIANCE_WXH_10_NEON(8, 4)
+HBD_VARIANCE_WXH_10_NEON(8, 8)
+HBD_VARIANCE_WXH_10_NEON(8, 16)
+
+HBD_VARIANCE_WXH_10_NEON(16, 8)
+HBD_VARIANCE_WXH_10_NEON(16, 16)
+HBD_VARIANCE_WXH_10_NEON(16, 32)
+
+HBD_VARIANCE_WXH_10_NEON(32, 16)
+HBD_VARIANCE_WXH_10_NEON(32, 32)
+HBD_VARIANCE_WXH_10_NEON(32, 64)
+
+HBD_VARIANCE_WXH_10_NEON(64, 32)
+HBD_VARIANCE_WXH_10_NEON(64, 64)
+
+// 12-bit bit-depth instantiations; the three largest sizes use the
+// 64-bit-accumulating variant.
+HBD_VARIANCE_WXH_12_NEON(4, 4)
+HBD_VARIANCE_WXH_12_NEON(4, 8)
+
+HBD_VARIANCE_WXH_12_NEON(8, 4)
+HBD_VARIANCE_WXH_12_NEON(8, 8)
+HBD_VARIANCE_WXH_12_NEON(8, 16)
+
+HBD_VARIANCE_WXH_12_NEON(16, 8)
+HBD_VARIANCE_WXH_12_NEON(16, 16)
+HBD_VARIANCE_WXH_12_NEON(16, 32)
+
+HBD_VARIANCE_WXH_12_NEON(32, 16)
+HBD_VARIANCE_WXH_12_NEON(32, 32)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(32, 64)
+
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 32)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64)
+
+// Defines vpx_highbd_{8,10,12}_get{S}x{S}var_neon, which return the sse and
+// sum via out-parameters. The 10- and 12-bit variants round sse down by 4/8
+// bits and sum by 2/4 bits respectively.
+#define HIGHBD_GET_VAR(S)                                             \
+  void vpx_highbd_8_get##S##x##S##var_neon(                           \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse, int *sum) {                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \
+                                 &sse_long, &sum_long);               \
+    *sse = (uint32_t)sse_long;                                        \
+    *sum = (int)sum_long;                                             \
+  }                                                                   \
+                                                                      \
+  void vpx_highbd_10_get##S##x##S##var_neon(                          \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse, int *sum) {                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \
+                                 &sse_long, &sum_long);               \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);                 \
+    *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);                      \
+  }                                                                   \
+                                                                      \
+  void vpx_highbd_12_get##S##x##S##var_neon(                          \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse, int *sum) {                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \
+                                 &sse_long, &sum_long);               \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);                 \
+    *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);                      \
+  }
+
+HIGHBD_GET_VAR(8)
+HIGHBD_GET_VAR(16)
+
+// Generic sum-of-squared-differences for a w x h block (w a multiple of 8).
+// Stores the result in *sse and also returns it.
+static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr,
+                                           int src_stride,
+                                           const uint16_t *ref_ptr,
+                                           int ref_stride, int w, int h,
+                                           unsigned int *sse) {
+  uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s = vld1q_u16(src_ptr + j);
+      uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+      // Absolute difference keeps the value unsigned before squaring.
+      uint16x8_t diff = vabdq_u16(s, r);
+
+      sse_u32[0] =
+          vmlal_u16(sse_u32[0], vget_low_u16(diff), vget_low_u16(diff));
+      sse_u32[1] =
+          vmlal_u16(sse_u32[1], vget_high_u16(diff), vget_high_u16(diff));
+
+      j += 8;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+  return *sse;
+}
+
+#if defined(__ARM_FEATURE_DOTPROD)
+
+// 8-bit content stored in uint16_t lanes fits in 8 bits, so it can be
+// narrowed with vmovn and squared/summed with the UDOT dot-product
+// instruction. These helpers are only used by the vpx_highbd_8_mse*
+// functions below. h must be even for the 8-wide variant.
+static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
+                                            int src_stride,
+                                            const uint16_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            unsigned int *sse) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h / 2;
+  do {
+    uint16x8_t s0, s1, r0, r1;
+    uint8x16_t s, r, diff;
+
+    s0 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    s1 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    r0 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+    r1 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+
+    // Narrow two 8-wide rows into one 16-byte vector.
+    s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+    r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);
+  } while (--i != 0);
+
+  *sse = horizontal_add_uint32x4(sse_u32);
+  return *sse;
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr,
+                                             int src_stride,
+                                             const uint16_t *ref_ptr,
+                                             int ref_stride, int h,
+                                             unsigned int *sse) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint16x8_t s0, s1, r0, r1;
+    uint8x16_t s, r, diff;
+
+    s0 = vld1q_u16(src_ptr);
+    s1 = vld1q_u16(src_ptr + 8);
+    r0 = vld1q_u16(ref_ptr);
+    r1 = vld1q_u16(ref_ptr + 8);
+
+    // Narrow one 16-wide row into one 16-byte vector.
+    s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+    r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sse = horizontal_add_uint32x4(sse_u32);
+  return *sse;
+}
+
+#else  // !defined(__ARM_FEATURE_DOTPROD)
+
+// Without the dot-product extension, fall back to the generic 16-bit path.
+static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
+                                            int src_stride,
+                                            const uint16_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            unsigned int *sse) {
+  return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h,
+                             sse);
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr,
+                                             int src_stride,
+                                             const uint16_t *ref_ptr,
+                                             int ref_stride, int h,
+                                             unsigned int *sse) {
+  return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h,
+                             sse);
+}
+
+#endif  // defined(__ARM_FEATURE_DOTPROD)
+
+// Defines vpx_highbd_{8,10,12}_mse{w}x{h}_neon. The 8-bit variant uses the
+// narrowed mse8 helpers; 10-/12-bit use the generic path and round the
+// result down by 4/8 bits.
+#define HIGHBD_MSE_WXH_NEON(w, h)                                         \
+  uint32_t vpx_highbd_8_mse##w##x##h##_neon(                              \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                         \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                         \
+    highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse);   \
+    return *sse;                                                          \
+  }                                                                       \
+                                                                          \
+  uint32_t vpx_highbd_10_mse##w##x##h##_neon(                             \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                         \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                         \
+    highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse);     \
+    *sse = ROUND_POWER_OF_TWO(*sse, 4);                                   \
+    return *sse;                                                          \
+  }                                                                       \
+                                                                          \
+  uint32_t vpx_highbd_12_mse##w##x##h##_neon(                             \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                         \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                         \
+    highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse);     \
+    *sse = ROUND_POWER_OF_TWO(*sse, 8);                                   \
+    return *sse;                                                          \
+  }
+
+HIGHBD_MSE_WXH_NEON(16, 16)
+HIGHBD_MSE_WXH_NEON(16, 8)
+HIGHBD_MSE_WXH_NEON(8, 16)
+HIGHBD_MSE_WXH_NEON(8, 8)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
new file mode 100644
index 0000000000..47684473ca
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
@@ -0,0 +1,931 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_ports/mem.h"
+
+// Load four rows of 4 int16 elements, stride p elements apart.
+static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p,
+                            int16x4_t *const s0, int16x4_t *const s1,
+                            int16x4_t *const s2, int16x4_t *const s3) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+  s += p;
+  *s3 = vld1_s16(s);
+}
+
+// Load four rows of 8 uint16 elements, stride p elements apart.
+static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p,
+                            uint16x8_t *const s0, uint16x8_t *const s1,
+                            uint16x8_t *const s2, uint16x8_t *const s3) {
+  *s0 = vld1q_u16(s);
+  s += p;
+  *s1 = vld1q_u16(s);
+  s += p;
+  *s2 = vld1q_u16(s);
+  s += p;
+  *s3 = vld1q_u16(s);
+}
+
+// Load eight rows of 8 int16 elements, stride p elements apart.
+static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p,
+                            int16x8_t *const s0, int16x8_t *const s1,
+                            int16x8_t *const s2, int16x8_t *const s3,
+                            int16x8_t *const s4, int16x8_t *const s5,
+                            int16x8_t *const s6, int16x8_t *const s7) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+  s += p;
+  *s4 = vld1q_s16(s);
+  s += p;
+  *s5 = vld1q_s16(s);
+  s += p;
+  *s6 = vld1q_s16(s);
+  s += p;
+  *s7 = vld1q_s16(s);
+}
+
+// Store eight rows of 8 uint16 elements, stride p elements apart.
+static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p,
+                             const uint16x8_t s0, const uint16x8_t s1,
+                             const uint16x8_t s2, const uint16x8_t s3,
+                             const uint16x8_t s4, const uint16x8_t s5,
+                             const uint16x8_t s6, const uint16x8_t s7) {
+  vst1q_u16(s, s0);
+  s += p;
+  vst1q_u16(s, s1);
+  s += p;
+  vst1q_u16(s, s2);
+  s += p;
+  vst1q_u16(s, s3);
+  s += p;
+  vst1q_u16(s, s4);
+  s += p;
+  vst1q_u16(s, s5);
+  s += p;
+  vst1q_u16(s, s6);
+  s += p;
+  vst1q_u16(s, s7);
+}
+
+// Apply the 8-tap filter to 4 pixels; returns the raw 32-bit sums
+// (rounding and clamping are done by the caller).
+static INLINE int32x4_t highbd_convolve8_4(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) {
+  const int16x4_t filters_lo = vget_low_s16(filters);
+  const int16x4_t filters_hi = vget_high_s16(filters);
+  int32x4_t sum;
+
+  sum = vmull_lane_s16(s0, filters_lo, 0);
+  sum = vmlal_lane_s16(sum, s1, filters_lo, 1);
+  sum = vmlal_lane_s16(sum, s2, filters_lo, 2);
+  sum = vmlal_lane_s16(sum, s3, filters_lo, 3);
+  sum = vmlal_lane_s16(sum, s4, filters_hi, 0);
+  sum = vmlal_lane_s16(sum, s5, filters_hi, 1);
+  sum = vmlal_lane_s16(sum, s6, filters_hi, 2);
+  sum = vmlal_lane_s16(sum, s7, filters_hi, 3);
+  return sum;
+}
+
+// Apply the 8-tap filter to 8 pixels, round (saturating narrow with a
+// shift of 7) and clamp the result to 'max' ((1 << bd) - 1 at the call
+// sites).
+static INLINE uint16x8_t
+highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+                   const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+                   const int16x8_t s6, const int16x8_t s7,
+                   const int16x8_t filters, const uint16x8_t max) {
+  const int16x4_t filters_lo = vget_low_s16(filters);
+  const int16x4_t filters_hi = vget_high_s16(filters);
+  int32x4_t sum0, sum1;
+  uint16x8_t d;
+
+  sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filters_hi, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3);
+  sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filters_hi, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3);
+  d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7));
+  d = vminq_u16(d, max);
+  return d;
+}
+
+// Horizontal 8-tap convolution for high-bit-depth data. Output is clamped
+// to (1 << bd) - 1.
+void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
+                                     uint16_t *dst, ptrdiff_t dst_stride,
+                                     const InterpKernel *filter, int x0_q4,
+                                     int x_step_q4, int y0_q4, int y_step_q4,
+                                     int w, int h, int bd) {
+  // Only the unit-step case (x_step_q4 == 16) is accelerated; anything else
+  // falls back to the C implementation.
+  if (x_step_q4 != 16) {
+    vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
+  } else {
+    const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+    uint16x8_t t0, t1, t2, t3;
+
+    assert(!((intptr_t)dst & 3));
+    assert(!(dst_stride & 3));
+
+    // Step back to cover the left-hand filter taps.
+    src -= 3;
+
+    if (h == 4) {
+      // 4-row blocks: transpose so the filter runs along vector lanes, then
+      // produce 4 columns of output per iteration.
+      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+      int32x4_t d0, d1, d2, d3;
+      uint16x8_t d01, d23;
+
+      __builtin_prefetch(src + 0 * src_stride);
+      __builtin_prefetch(src + 1 * src_stride);
+      __builtin_prefetch(src + 2 * src_stride);
+      __builtin_prefetch(src + 3 * src_stride);
+      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+      transpose_u16_8x4(&t0, &t1, &t2, &t3);
+      s0 = vreinterpret_s16_u16(vget_low_u16(t0));
+      s1 = vreinterpret_s16_u16(vget_low_u16(t1));
+      s2 = vreinterpret_s16_u16(vget_low_u16(t2));
+      s3 = vreinterpret_s16_u16(vget_low_u16(t3));
+      s4 = vreinterpret_s16_u16(vget_high_u16(t0));
+      s5 = vreinterpret_s16_u16(vget_high_u16(t1));
+      s6 = vreinterpret_s16_u16(vget_high_u16(t2));
+      __builtin_prefetch(dst + 0 * dst_stride);
+      __builtin_prefetch(dst + 1 * dst_stride);
+      __builtin_prefetch(dst + 2 * dst_stride);
+      __builtin_prefetch(dst + 3 * dst_stride);
+      src += 7;
+
+      do {
+        load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
+        transpose_s16_4x4d(&s7, &s8, &s9, &s10);
+
+        d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+        d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+        d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+        d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+        // Round, clamp and transpose back to row order before storing.
+        d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
+        d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
+        d01 = vminq_u16(d01, max);
+        d23 = vminq_u16(d23, max);
+        transpose_u16_4x4q(&d01, &d23);
+
+        vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
+        vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
+        vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
+        vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
+
+        // Slide the 8-tap window forward by 4 samples.
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        src += 4;
+        dst += 4;
+        w -= 4;
+      } while (w > 0);
+    } else {
+      int16x8_t t4, t5, t6, t7;
+      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+      uint16x8_t d0, d1, d2, d3;
+
+      if (w == 4) {
+        // 4-wide, tall blocks: process 8 rows per iteration.
+        do {
+          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
+                   &s5, &s6, &s7);
+          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+          load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
+                   &t4, &t5, &t6, &t7);
+          src += 8 * src_stride;
+          __builtin_prefetch(dst + 0 * dst_stride);
+          __builtin_prefetch(dst + 1 * dst_stride);
+          __builtin_prefetch(dst + 2 * dst_stride);
+          __builtin_prefetch(dst + 3 * dst_stride);
+          __builtin_prefetch(dst + 4 * dst_stride);
+          __builtin_prefetch(dst + 5 * dst_stride);
+          __builtin_prefetch(dst + 6 * dst_stride);
+          __builtin_prefetch(dst + 7 * dst_stride);
+          transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
+
+          __builtin_prefetch(src + 0 * src_stride);
+          __builtin_prefetch(src + 1 * src_stride);
+          __builtin_prefetch(src + 2 * src_stride);
+          __builtin_prefetch(src + 3 * src_stride);
+          __builtin_prefetch(src + 4 * src_stride);
+          __builtin_prefetch(src + 5 * src_stride);
+          __builtin_prefetch(src + 6 * src_stride);
+          __builtin_prefetch(src + 7 * src_stride);
+          d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+          d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+          d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+          d3 =
+              highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+          // Transpose back so each 4-lane half is one output row.
+          transpose_u16_8x4(&d0, &d1, &d2, &d3);
+          vst1_u16(dst, vget_low_u16(d0));
+          dst += dst_stride;
+          vst1_u16(dst, vget_low_u16(d1));
+          dst += dst_stride;
+          vst1_u16(dst, vget_low_u16(d2));
+          dst += dst_stride;
+          vst1_u16(dst, vget_low_u16(d3));
+          dst += dst_stride;
+          vst1_u16(dst, vget_high_u16(d0));
+          dst += dst_stride;
+          vst1_u16(dst, vget_high_u16(d1));
+          dst += dst_stride;
+          vst1_u16(dst, vget_high_u16(d2));
+          dst += dst_stride;
+          vst1_u16(dst, vget_high_u16(d3));
+          dst += dst_stride;
+          h -= 8;
+        } while (h > 0);
+      } else {
+        // General case: 8x8 tiles, iterating across the row then down.
+        int width;
+        const uint16_t *s;
+        uint16_t *d;
+        int16x8_t s11, s12, s13, s14;
+        uint16x8_t d4, d5, d6, d7;
+
+        do {
+          __builtin_prefetch(src + 0 * src_stride);
+          __builtin_prefetch(src + 1 * src_stride);
+          __builtin_prefetch(src + 2 * src_stride);
+          __builtin_prefetch(src + 3 * src_stride);
+          __builtin_prefetch(src + 4 * src_stride);
+          __builtin_prefetch(src + 5 * src_stride);
+          __builtin_prefetch(src + 6 * src_stride);
+          __builtin_prefetch(src + 7 * src_stride);
+          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
+                   &s5, &s6, &s7);
+          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+          width = w;
+          s = src + 7;
+          d = dst;
+          __builtin_prefetch(dst + 0 * dst_stride);
+          __builtin_prefetch(dst + 1 * dst_stride);
+          __builtin_prefetch(dst + 2 * dst_stride);
+          __builtin_prefetch(dst + 3 * dst_stride);
+          __builtin_prefetch(dst + 4 * dst_stride);
+          __builtin_prefetch(dst + 5 * dst_stride);
+          __builtin_prefetch(dst + 6 * dst_stride);
+          __builtin_prefetch(dst + 7 * dst_stride);
+
+          do {
+            load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
+                     &s12, &s13, &s14);
+            transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
+
+            d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
+                                    max);
+            d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
+                                    max);
+            d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
+                                    max);
+            d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
+                                    max);
+            d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
+                                    max);
+            d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
+                                    max);
+            d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
+                                    max);
+            d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
+                                    filters, max);
+
+            transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+            store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+            // Slide the 8-tap window forward by 8 samples.
+            s0 = s8;
+            s1 = s9;
+            s2 = s10;
+            s3 = s11;
+            s4 = s12;
+            s5 = s13;
+            s6 = s14;
+            s += 8;
+            d += 8;
+            width -= 8;
+          } while (width > 0);
+          src += 8 * src_stride;
+          dst += 8 * dst_stride;
+          h -= 8;
+        } while (h > 0);
+      }
+    }
+  }
+}
+
+void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
+ ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4,
+ int y_step_q4, int w, int h, int bd) {
+ if (x_step_q4 != 16) {
+ vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
+ } else {
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+
+ src -= 3;
+
+ if (h == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int32x4_t d0, d1, d2, d3;
+ uint16x8_t t0, t1, t2, t3;
+ uint16x8_t d01, d23, t01, t23;
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u16_8x4(&t0, &t1, &t2, &t3);
+ s0 = vreinterpret_s16_u16(vget_low_u16(t0));
+ s1 = vreinterpret_s16_u16(vget_low_u16(t1));
+ s2 = vreinterpret_s16_u16(vget_low_u16(t2));
+ s3 = vreinterpret_s16_u16(vget_low_u16(t3));
+ s4 = vreinterpret_s16_u16(vget_high_u16(t0));
+ s5 = vreinterpret_s16_u16(vget_high_u16(t1));
+ s6 = vreinterpret_s16_u16(vget_high_u16(t2));
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ src += 7;
+
+ do {
+ load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
+ transpose_s16_4x4d(&s7, &s8, &s9, &s10);
+
+ d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
+ t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
+ t01 = vminq_u16(t01, max);
+ t23 = vminq_u16(t23, max);
+ transpose_u16_4x4q(&t01, &t23);
+
+ d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
+ vld1_u16(dst + 2 * dst_stride));
+ d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
+ vld1_u16(dst + 3 * dst_stride));
+ d01 = vrhaddq_u16(d01, t01);
+ d23 = vrhaddq_u16(d23, t23);
+
+ vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
+ vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
+ vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
+ vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4;
+ dst += 4;
+ w -= 4;
+ } while (w > 0);
+ } else {
+ int16x8_t t4, t5, t6, t7;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
+
+ if (w == 4) {
+ do {
+ load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
+ &s5, &s6, &s7);
+ transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
+ &t4, &t5, &t6, &t7);
+ src += 8 * src_stride;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+ transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ t3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+ transpose_u16_8x4(&t0, &t1, &t2, &t3);
+
+ d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
+ vld1_u16(dst + 4 * dst_stride));
+ d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
+ vld1_u16(dst + 5 * dst_stride));
+ d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
+ vld1_u16(dst + 6 * dst_stride));
+ d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride),
+ vld1_u16(dst + 7 * dst_stride));
+ d0 = vrhaddq_u16(d0, t0);
+ d1 = vrhaddq_u16(d1, t1);
+ d2 = vrhaddq_u16(d2, t2);
+ d3 = vrhaddq_u16(d3, t3);
+
+ vst1_u16(dst, vget_low_u16(d0));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d1));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d2));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d3));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d0));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d1));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d2));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d3));
+ dst += dst_stride;
+ h -= 8;
+ } while (h > 0);
+ } else {
+ int width;
+ const uint16_t *s;
+ uint16_t *d;
+ int16x8_t s11, s12, s13, s14;
+ uint16x8_t d4, d5, d6, d7;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
+ &s5, &s6, &s7);
+ transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ width = w;
+ s = src + 7;
+ d = dst;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+
+ do {
+ load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
+ &s12, &s13, &s14);
+ transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
+
+ d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
+ max);
+ d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
+ max);
+ d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
+ max);
+ d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
+ max);
+ d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
+ max);
+ d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
+ max);
+ d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
+ max);
+ d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
+ filters, max);
+
+ transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+ d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride));
+ d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride));
+ d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride));
+ d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride));
+
+ store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+ }
+ }
+ }
+}
+
+/* Vertical 8-tap high-bitdepth convolution.
+ *
+ * Only the unscaled case (y_step_q4 == 16) is implemented with NEON; any
+ * other step size falls back to the C reference implementation. Output
+ * samples are clamped to [0, (1 << bd) - 1] via `max`. w and h are assumed
+ * to be multiples of 4 (w == 4) or 8 (w >= 8) by the loop structure. */
+void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (y_step_q4 != 16) {
+ vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h, bd);
+ } else {
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+ /* Pixel clamp for the configured bit depth. */
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+
+ /* The 8-tap filter needs 3 rows of context above the first output row. */
+ src -= 3 * src_stride;
+
+ /* w == 4: one 4-wide column, producing 4 output rows per iteration. */
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int32x4_t d0, d1, d2, d3;
+ uint16x8_t d01, d23;
+
+ /* Prime the 7-row history window (s0..s6). */
+ s0 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s1 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s2 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s3 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s4 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s5 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s6 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+
+ do {
+ /* Load the next 4 input rows (s7..s10). */
+ s7 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s8 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s9 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s10 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ /* Round-shift the 32-bit accumulators, saturate to u16, clamp to bd. */
+ d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
+ d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
+ d01 = vminq_u16(d01, max);
+ d23 = vminq_u16(d23, max);
+ vst1_u16(dst, vget_low_u16(d01));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d01));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d23));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d23));
+ dst += dst_stride;
+
+ /* Slide the 8-row input window down by 4 rows. */
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ /* w >= 8: process the block in 8-wide vertical strips. */
+ int height;
+ const uint16_t *s;
+ uint16_t *d;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint16x8_t d0, d1, d2, d3;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ /* Prime the 7-row history window for this strip. */
+ s = src;
+ s0 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s1 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s2 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s3 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s4 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s5 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s6 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ d = dst;
+ height = h;
+
+ do {
+ s7 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s8 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s9 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s10 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ d3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+ vst1q_u16(d, d0);
+ d += dst_stride;
+ vst1q_u16(d, d1);
+ d += dst_stride;
+ vst1q_u16(d, d2);
+ d += dst_stride;
+ vst1q_u16(d, d3);
+ d += dst_stride;
+
+ /* Slide the 8-row input window down by 4 rows. */
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+ } while (height > 0);
+ /* Advance to the next 8-wide strip. */
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+ }
+}
+
+/* Vertical 8-tap high-bitdepth convolution, averaged into dst.
+ *
+ * Identical filtering to vpx_highbd_convolve8_vert_neon, but instead of
+ * overwriting dst the clamped result is combined with the existing dst
+ * pixels using vrhaddq_u16, i.e. a rounding average (a + b + 1) >> 1.
+ * Only y_step_q4 == 16 is handled with NEON; other steps use the C path. */
+void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
+ ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (y_step_q4 != 16) {
+ vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
+ } else {
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+ /* Pixel clamp for the configured bit depth. */
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+
+ /* The 8-tap filter needs 3 rows of context above the first output row. */
+ src -= 3 * src_stride;
+
+ /* w == 4: one 4-wide column, producing 4 output rows per iteration. */
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int32x4_t d0, d1, d2, d3;
+ uint16x8_t d01, d23, t01, t23;
+
+ /* Prime the 7-row history window (s0..s6). */
+ s0 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s1 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s2 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s3 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s4 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s5 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s6 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+
+ do {
+ s7 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s8 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s9 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+ s10 = vreinterpret_s16_u16(vld1_u16(src));
+ src += src_stride;
+
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ /* Round-shift, saturate to u16 and clamp to the bit depth. */
+ t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
+ t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
+ t01 = vminq_u16(t01, max);
+ t23 = vminq_u16(t23, max);
+
+ /* Rounding-average the filtered rows with the existing dst rows. */
+ d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
+ vld1_u16(dst + 1 * dst_stride));
+ d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
+ vld1_u16(dst + 3 * dst_stride));
+ d01 = vrhaddq_u16(d01, t01);
+ d23 = vrhaddq_u16(d23, t23);
+
+ vst1_u16(dst, vget_low_u16(d01));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d01));
+ dst += dst_stride;
+ vst1_u16(dst, vget_low_u16(d23));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d23));
+ dst += dst_stride;
+
+ /* Slide the 8-row input window down by 4 rows. */
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ /* w >= 8: process the block in 8-wide vertical strips. */
+ int height;
+ const uint16_t *s;
+ uint16_t *d;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ /* Prime the 7-row history window for this strip. */
+ s = src;
+ s0 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s1 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s2 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s3 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s4 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s5 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s6 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ d = dst;
+ height = h;
+
+ do {
+ s7 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s8 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s9 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+ s10 = vreinterpretq_s16_u16(vld1q_u16(s));
+ s += src_stride;
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ t3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+ /* Rounding-average the filtered rows with the existing dst rows. */
+ d0 = vld1q_u16(d + 0 * dst_stride);
+ d1 = vld1q_u16(d + 1 * dst_stride);
+ d2 = vld1q_u16(d + 2 * dst_stride);
+ d3 = vld1q_u16(d + 3 * dst_stride);
+ d0 = vrhaddq_u16(d0, t0);
+ d1 = vrhaddq_u16(d1, t1);
+ d2 = vrhaddq_u16(d2, t2);
+ d3 = vrhaddq_u16(d3, t3);
+
+ vst1q_u16(d, d0);
+ d += dst_stride;
+ vst1q_u16(d, d1);
+ d += dst_stride;
+ vst1q_u16(d, d2);
+ d += dst_stride;
+ vst1q_u16(d, d3);
+ d += dst_stride;
+
+ /* Slide the 8-row input window down by 4 rows. */
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+ } while (height > 0);
+ /* Advance to the next 8-wide strip. */
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
new file mode 100644
index 0000000000..765a054f8d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+/* High-bitdepth "average" predictor: dst = rounding_avg(src, dst).
+ *
+ * No filtering takes place — all filter/step parameters are ignored — so
+ * this is effectively a copy that rounding-averages into the destination
+ * with vrhaddq_u16 ((a + b + 1) >> 1). One specialized loop per block
+ * width; narrow widths process two rows per iteration, w == 64 processes
+ * one row in two 32-pixel halves. */
+void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+ (void)bd;
+
+ if (w < 8) { // avg4
+ uint16x4_t s0, s1, d0, d1;
+ uint16x8_t s01, d01;
+ /* Two 4-pixel rows packed into one 8-lane vector per iteration. */
+ do {
+ s0 = vld1_u16(src);
+ d0 = vld1_u16(dst);
+ src += src_stride;
+ s1 = vld1_u16(src);
+ d1 = vld1_u16(dst + dst_stride);
+ src += src_stride;
+ s01 = vcombine_u16(s0, s1);
+ d01 = vcombine_u16(d0, d1);
+ d01 = vrhaddq_u16(s01, d01);
+ vst1_u16(dst, vget_low_u16(d01));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(d01));
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else if (w == 8) { // avg8
+ uint16x8_t s0, s1, d0, d1;
+ do {
+ s0 = vld1q_u16(src);
+ d0 = vld1q_u16(dst);
+ src += src_stride;
+ s1 = vld1q_u16(src);
+ d1 = vld1q_u16(dst + dst_stride);
+ src += src_stride;
+
+ d0 = vrhaddq_u16(s0, d0);
+ d1 = vrhaddq_u16(s1, d1);
+
+ vst1q_u16(dst, d0);
+ dst += dst_stride;
+ vst1q_u16(dst, d1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else if (w < 32) { // avg16
+ uint16x8_t s0l, s0h, s1l, s1h, d0l, d0h, d1l, d1h;
+ do {
+ s0l = vld1q_u16(src);
+ s0h = vld1q_u16(src + 8);
+ d0l = vld1q_u16(dst);
+ d0h = vld1q_u16(dst + 8);
+ src += src_stride;
+ s1l = vld1q_u16(src);
+ s1h = vld1q_u16(src + 8);
+ d1l = vld1q_u16(dst + dst_stride);
+ d1h = vld1q_u16(dst + dst_stride + 8);
+ src += src_stride;
+
+ d0l = vrhaddq_u16(s0l, d0l);
+ d0h = vrhaddq_u16(s0h, d0h);
+ d1l = vrhaddq_u16(s1l, d1l);
+ d1h = vrhaddq_u16(s1h, d1h);
+
+ vst1q_u16(dst, d0l);
+ vst1q_u16(dst + 8, d0h);
+ dst += dst_stride;
+ vst1q_u16(dst, d1l);
+ vst1q_u16(dst + 8, d1h);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else if (w == 32) { // avg32
+ uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
+ /* One full 32-pixel row per pass, two rows per loop iteration. */
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ d0 = vld1q_u16(dst);
+ d1 = vld1q_u16(dst + 8);
+ d2 = vld1q_u16(dst + 16);
+ d3 = vld1q_u16(dst + 24);
+ src += src_stride;
+
+ d0 = vrhaddq_u16(s0, d0);
+ d1 = vrhaddq_u16(s1, d1);
+ d2 = vrhaddq_u16(s2, d2);
+ d3 = vrhaddq_u16(s3, d3);
+
+ vst1q_u16(dst, d0);
+ vst1q_u16(dst + 8, d1);
+ vst1q_u16(dst + 16, d2);
+ vst1q_u16(dst + 24, d3);
+ dst += dst_stride;
+
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ d0 = vld1q_u16(dst);
+ d1 = vld1q_u16(dst + 8);
+ d2 = vld1q_u16(dst + 16);
+ d3 = vld1q_u16(dst + 24);
+ src += src_stride;
+
+ d0 = vrhaddq_u16(s0, d0);
+ d1 = vrhaddq_u16(s1, d1);
+ d2 = vrhaddq_u16(s2, d2);
+ d3 = vrhaddq_u16(s3, d3);
+
+ vst1q_u16(dst, d0);
+ vst1q_u16(dst + 8, d1);
+ vst1q_u16(dst + 16, d2);
+ vst1q_u16(dst + 24, d3);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else { // avg64
+ uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
+ /* One 64-pixel row per iteration, handled as two 32-pixel halves. */
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ d0 = vld1q_u16(dst);
+ d1 = vld1q_u16(dst + 8);
+ d2 = vld1q_u16(dst + 16);
+ d3 = vld1q_u16(dst + 24);
+
+ d0 = vrhaddq_u16(s0, d0);
+ d1 = vrhaddq_u16(s1, d1);
+ d2 = vrhaddq_u16(s2, d2);
+ d3 = vrhaddq_u16(s3, d3);
+
+ vst1q_u16(dst, d0);
+ vst1q_u16(dst + 8, d1);
+ vst1q_u16(dst + 16, d2);
+ vst1q_u16(dst + 24, d3);
+
+ s0 = vld1q_u16(src + 32);
+ s1 = vld1q_u16(src + 40);
+ s2 = vld1q_u16(src + 48);
+ s3 = vld1q_u16(src + 56);
+ d0 = vld1q_u16(dst + 32);
+ d1 = vld1q_u16(dst + 40);
+ d2 = vld1q_u16(dst + 48);
+ d3 = vld1q_u16(dst + 56);
+
+ d0 = vrhaddq_u16(s0, d0);
+ d1 = vrhaddq_u16(s1, d1);
+ d2 = vrhaddq_u16(s2, d2);
+ d3 = vrhaddq_u16(s3, d3);
+
+ vst1q_u16(dst + 32, d0);
+ vst1q_u16(dst + 40, d1);
+ vst1q_u16(dst + 48, d2);
+ vst1q_u16(dst + 56, d3);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
new file mode 100644
index 0000000000..7751082083
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+/* High-bitdepth block copy: dst = src.
+ *
+ * No filtering takes place — all filter/step parameters are ignored.
+ * One specialized loop per block width; widths below 32 copy two rows
+ * per iteration, w >= 32 copy one full row per iteration. */
+void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ (void)filter;
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+ (void)bd;
+
+ if (w < 8) { // copy4
+ uint16x4_t s0, s1;
+ do {
+ s0 = vld1_u16(src);
+ src += src_stride;
+ s1 = vld1_u16(src);
+ src += src_stride;
+
+ vst1_u16(dst, s0);
+ dst += dst_stride;
+ vst1_u16(dst, s1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 8) { // copy8
+ uint16x8_t s0, s1;
+ do {
+ s0 = vld1q_u16(src);
+ src += src_stride;
+ s1 = vld1q_u16(src);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ dst += dst_stride;
+ vst1q_u16(dst, s1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w < 32) { // copy16
+ uint16x8_t s0, s1, s2, s3;
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ src += src_stride;
+ s2 = vld1q_u16(src);
+ s3 = vld1q_u16(src + 8);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ vst1q_u16(dst + 8, s1);
+ dst += dst_stride;
+ vst1q_u16(dst, s2);
+ vst1q_u16(dst + 8, s3);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 32) { // copy32
+ uint16x8_t s0, s1, s2, s3;
+ /* One 32-pixel row (four q registers) per iteration. */
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ vst1q_u16(dst + 8, s1);
+ vst1q_u16(dst + 16, s2);
+ vst1q_u16(dst + 24, s3);
+ dst += dst_stride;
+ } while (--h != 0);
+ } else { // copy64
+ uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ /* One 64-pixel row (eight q registers) per iteration. */
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ s4 = vld1q_u16(src + 32);
+ s5 = vld1q_u16(src + 40);
+ s6 = vld1q_u16(src + 48);
+ s7 = vld1q_u16(src + 56);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ vst1q_u16(dst + 8, s1);
+ vst1q_u16(dst + 16, s2);
+ vst1q_u16(dst + 24, s3);
+ vst1q_u16(dst + 32, s4);
+ vst1q_u16(dst + 40, s5);
+ vst1q_u16(dst + 48, s6);
+ vst1q_u16(dst + 56, s7);
+ dst += dst_stride;
+ } while (--h != 0);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
new file mode 100644
index 0000000000..414ade3530
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+/* Separable 2-D high-bitdepth 8-tap convolution: a horizontal pass into a
+ * stack scratch buffer, then a vertical pass into dst. */
+void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h, int bd) {
+ // Scratch for the horizontal pass: up to 64 columns by 136 rows.
+ // 136 covers the worst-case intermediate_height (computed below) for a
+ // 64-tall block; the + 1 on top of that makes it divisible by 4.
+ uint16_t temp[64 * 136];
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ /* Filter starting 3 lines back. The neon implementation will ignore the given
+ * height and filter a multiple of 4 lines. Since this goes in to the temp
+ * buffer which has lots of extra room and is subsequently discarded this is
+ * safe if somewhat less than ideal. */
+ vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
+ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+ intermediate_height, bd);
+
+ /* Step into the temp buffer 3 lines to get the actual frame data */
+ vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
+}
+
+/* Separable 2-D high-bitdepth 8-tap convolution averaged into dst:
+ * horizontal pass into a scratch buffer, then an averaging vertical pass,
+ * so the blend with dst happens only once, after both passes. */
+void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ // Scratch for the horizontal pass: up to 64 columns by 136 rows.
+ // 136 covers the worst-case intermediate_height (computed below) for a
+ // 64-tall block; the + 1 on top of that makes it divisible by 4.
+ uint16_t temp[64 * 136];
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ /* This implementation has the same issues as above. In addition, we only want
+ * to average the values after both passes.
+ */
+ vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
+ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+ intermediate_height, bd);
+ vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
new file mode 100644
index 0000000000..bf5192a683
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+/* Add a non-negative DC residual to one 16-pixel row with saturating u8
+ * arithmetic, then advance *dest to the next row. */
+static INLINE void idct16x16_1_add_pos_kernel(uint8_t **dest, const int stride,
+ const uint8x16_t res) {
+ const uint8x16_t a = vld1q_u8(*dest);
+ const uint8x16_t b = vqaddq_u8(a, res);
+ vst1q_u8(*dest, b);
+ *dest += stride;
+}
+
+/* Subtract the magnitude of a negative DC residual from one 16-pixel row
+ * with saturating u8 arithmetic, then advance *dest to the next row. */
+static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride,
+ const uint8x16_t res) {
+ const uint8x16_t a = vld1q_u8(*dest);
+ const uint8x16_t b = vqsubq_u8(a, res);
+ vst1q_u8(*dest, b);
+ *dest += stride;
+}
+
+/* DC-only 16x16 inverse DCT + reconstruction.
+ *
+ * Only input[0] is used: it is scaled by cospi_16_64 twice (once per 1-D
+ * transform dimension) and rounded by 2^6 to give the constant residual
+ * a1, which is then added to all 16x16 destination pixels. The add is
+ * split by sign so saturating unsigned byte ops can be used; the 16 call
+ * pairs below are a deliberate full unroll of the 16 rows. */
+void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const int16_t out0 =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+ const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+
+ if (a1 >= 0) {
+ const uint8x16_t dc = create_dcq(a1);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ idct16x16_1_add_pos_kernel(&dest, stride, dc);
+ } else {
+ const uint8x16_t dc = create_dcq(-a1);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
new file mode 100644
index 0000000000..fc7f4a7747
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
@@ -0,0 +1,764 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+// Narrows two 32-bit accumulators back to 16 bits with rounding, shifting
+// right by DCT_CONST_BITS (the fixed-point scale of the cospi constants).
+static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
+                                int16x4_t *const d1) {
+  *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+}
+
+// Butterfly at full 32-bit precision (no narrowing):
+//   t32[0] = s0 * cospi_24 - s1 * cospi_8
+//   t32[1] = s1 * cospi_24 + s0 * cospi_8
+// Lane 3 of cospi_0_8_16_24 holds cospi_24_64, lane 1 holds cospi_8_64.
+static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
+                                            const int16x4_t s1,
+                                            const int16x4_t cospi_0_8_16_24,
+                                            int32x4_t *const t32) {
+  t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3);
+  t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3);
+  t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1);
+  t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1);
+}
+
+// cospi_8/cospi_24 butterfly on 4-lane vectors, with round-shift narrowing:
+//   d0 = round(s0 * cospi_24 - s1 * cospi_8)
+//   d1 = round(s1 * cospi_24 + s0 * cospi_8)
+static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1,
+                                     const int16x4_t cospi_0_8_16_24,
+                                     int16x4_t *const d0, int16x4_t *const d1) {
+  int32x4_t t32[2];
+
+  idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
+  wrap_low_4x2(t32, d0, d1);
+}
+
+// Same as idct_cospi_8_24_d but the second result is negated before the
+// narrowing round-shift:
+//   d1 = round(-(s1 * cospi_24 + s0 * cospi_8))
+static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1,
+                                         const int16x4_t cospi_0_8_16_24,
+                                         int16x4_t *const d0,
+                                         int16x4_t *const d1) {
+  int32x4_t t32[2];
+
+  idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
+  t32[1] = vnegq_s32(t32[1]);
+  wrap_low_4x2(t32, d0, d1);
+}
+
+// cospi_16/cospi_16 butterfly on 4-lane vectors (lane 2 holds cospi_16_64):
+//   d0 = round((s1 - s0) * cospi_16)
+//   d1 = round((s1 + s0) * cospi_16)
+// The shared product s1 * cospi_16 is computed once in t32[2].
+static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1,
+                                      const int16x4_t cospi_0_8_16_24,
+                                      int16x4_t *const d0,
+                                      int16x4_t *const d1) {
+  int32x4_t t32[3];
+
+  t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2);
+  t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
+  t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
+  wrap_low_4x2(t32, d0, d1);
+}
+
+// One pass over half (8 vectors wide) of the full 16x16 inverse DCT.
+// If |output| is non-NULL this is pass 1: |input| holds tran_low_t
+// coefficients and the intermediate result is stored to |output|.
+// If |output| is NULL this is pass 2: |input| holds the int16_t pass-1
+// intermediate and the final residual is added to |dest| with |stride|;
+// |highbd_flag| selects the 16-bit-pixel store path over the 8-bit one.
+void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output,
+                                  void *const dest, const int stride,
+                                  const int highbd_flag) {
+  const int16x8_t cospis0 = vld1q_s16(kCospi);
+  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
+  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
+  const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0);
+  const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1);
+  const int16x4_t cospi_6_26N_14_18N = vget_high_s16(cospis1);
+  int16x8_t in[16], step1[16], step2[16], out[16];
+
+  // Load input (16x8)
+  // Each 16-element input row is split into two 8-lane vectors: in[r] gets
+  // the left half of row r and in[8 + r] the right half, so the two 8x8
+  // transposes below produce the left and right column groups.
+  if (output) {
+    const tran_low_t *inputT = (const tran_low_t *)input;
+    in[0] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[8] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[1] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[9] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[2] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[10] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[3] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[11] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[4] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[12] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[5] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[13] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[6] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[14] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[7] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[15] = load_tran_low_to_s16q(inputT);
+  } else {
+    const int16_t *inputT = (const int16_t *)input;
+    in[0] = vld1q_s16(inputT);
+    inputT += 8;
+    in[8] = vld1q_s16(inputT);
+    inputT += 8;
+    in[1] = vld1q_s16(inputT);
+    inputT += 8;
+    in[9] = vld1q_s16(inputT);
+    inputT += 8;
+    in[2] = vld1q_s16(inputT);
+    inputT += 8;
+    in[10] = vld1q_s16(inputT);
+    inputT += 8;
+    in[3] = vld1q_s16(inputT);
+    inputT += 8;
+    in[11] = vld1q_s16(inputT);
+    inputT += 8;
+    in[4] = vld1q_s16(inputT);
+    inputT += 8;
+    in[12] = vld1q_s16(inputT);
+    inputT += 8;
+    in[5] = vld1q_s16(inputT);
+    inputT += 8;
+    in[13] = vld1q_s16(inputT);
+    inputT += 8;
+    in[6] = vld1q_s16(inputT);
+    inputT += 8;
+    in[14] = vld1q_s16(inputT);
+    inputT += 8;
+    in[7] = vld1q_s16(inputT);
+    inputT += 8;
+    in[15] = vld1q_s16(inputT);
+  }
+
+  // Transpose
+  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
+  transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
+                    &in[15]);
+
+  // stage 1
+  // Input reordering for the 16-point butterfly network. Indices are written
+  // in a doubled (n / 2) form, presumably to mirror the 32-point numbering of
+  // the reference C code.
+  step1[0] = in[0 / 2];
+  step1[1] = in[16 / 2];
+  step1[2] = in[8 / 2];
+  step1[3] = in[24 / 2];
+  step1[4] = in[4 / 2];
+  step1[5] = in[20 / 2];
+  step1[6] = in[12 / 2];
+  step1[7] = in[28 / 2];
+  step1[8] = in[2 / 2];
+  step1[9] = in[18 / 2];
+  step1[10] = in[10 / 2];
+  step1[11] = in[26 / 2];
+  step1[12] = in[6 / 2];
+  step1[13] = in[22 / 2];
+  step1[14] = in[14 / 2];
+  step1[15] = in[30 / 2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]);
+  idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9],
+                   &step2[14]);
+  idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
+                   &step2[13]);
+  idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11],
+                  &step2[12]);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+  idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]);
+  idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]);
+  step1[8] = vaddq_s16(step2[8], step2[9]);
+  step1[9] = vsubq_s16(step2[8], step2[9]);
+  step1[10] = vsubq_s16(step2[11], step2[10]);
+  step1[11] = vaddq_s16(step2[11], step2[10]);
+  step1[12] = vaddq_s16(step2[12], step2[13]);
+  step1[13] = vsubq_s16(step2[12], step2[13]);
+  step1[14] = vsubq_s16(step2[15], step2[14]);
+  step1[15] = vaddq_s16(step2[15], step2[14]);
+
+  // stage 4
+  idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]);
+  idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]);
+  step2[4] = vaddq_s16(step1[4], step1[5]);
+  step2[5] = vsubq_s16(step1[4], step1[5]);
+  step2[6] = vsubq_s16(step1[7], step1[6]);
+  step2[7] = vaddq_s16(step1[7], step1[6]);
+  step2[8] = step1[8];
+  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+                    &step2[14]);
+  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
+                        &step2[10]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  step1[0] = vaddq_s16(step2[0], step2[3]);
+  step1[1] = vaddq_s16(step2[1], step2[2]);
+  step1[2] = vsubq_s16(step2[1], step2[2]);
+  step1[3] = vsubq_s16(step2[0], step2[3]);
+  step1[4] = step2[4];
+  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
+  step1[7] = step2[7];
+  step1[8] = vaddq_s16(step2[8], step2[11]);
+  step1[9] = vaddq_s16(step2[9], step2[10]);
+  step1[10] = vsubq_s16(step2[9], step2[10]);
+  step1[11] = vsubq_s16(step2[8], step2[11]);
+  step1[12] = vsubq_s16(step2[15], step2[12]);
+  step1[13] = vsubq_s16(step2[14], step2[13]);
+  step1[14] = vaddq_s16(step2[14], step2[13]);
+  step1[15] = vaddq_s16(step2[15], step2[12]);
+
+  // stage 6
+  step2[0] = vaddq_s16(step1[0], step1[7]);
+  step2[1] = vaddq_s16(step1[1], step1[6]);
+  step2[2] = vaddq_s16(step1[2], step1[5]);
+  step2[3] = vaddq_s16(step1[3], step1[4]);
+  step2[4] = vsubq_s16(step1[3], step1[4]);
+  step2[5] = vsubq_s16(step1[2], step1[5]);
+  step2[6] = vsubq_s16(step1[1], step1[6]);
+  step2[7] = vsubq_s16(step1[0], step1[7]);
+  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+                     &step2[13]);
+  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+                     &step2[12]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  idct16x16_add_stage7(step2, out);
+
+  // Pass 1 stores the intermediate; pass 2 adds the residual to the
+  // destination pixels (8- or 16-bit per |highbd_flag|).
+  if (output) {
+    idct16x16_store_pass1(out, output);
+  } else {
+    if (highbd_flag) {
+      idct16x16_add_store_bd8(out, dest, stride);
+    } else {
+      idct16x16_add_store(out, dest, stride);
+    }
+  }
+}
+
+// Half-transform pass for the 38-coefficient shortcut: only the top-left
+// 8x8 coefficients are assumed non-zero, so just 8 vectors are loaded
+// (stride 16 skips the all-zero right half of each row) and the stage-1
+// odd inputs are dropped entirely.
+// cospisd* hold the cospi constants doubled: vqrdmulh computes
+// (2 * a * b + rounding) >> 16, so pre-doubled constants give the desired
+// round(a * cospi >> 14) in a single instruction.
+// |output| / |dest| / |highbd_flag| behave as in
+// vpx_idct16x16_256_add_half1d.
+void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output,
+                                 void *const dest, const int stride,
+                                 const int highbd_flag) {
+  const int16x8_t cospis0 = vld1q_s16(kCospi);
+  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
+  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
+  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
+  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
+  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
+  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
+  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
+  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
+  int16x8_t in[8], step1[16], step2[16], out[16];
+
+  // Load input (8x8)
+  if (output) {
+    const tran_low_t *inputT = (const tran_low_t *)input;
+    in[0] = load_tran_low_to_s16q(inputT);
+    inputT += 16;
+    in[1] = load_tran_low_to_s16q(inputT);
+    inputT += 16;
+    in[2] = load_tran_low_to_s16q(inputT);
+    inputT += 16;
+    in[3] = load_tran_low_to_s16q(inputT);
+    inputT += 16;
+    in[4] = load_tran_low_to_s16q(inputT);
+    inputT += 16;
+    in[5] = load_tran_low_to_s16q(inputT);
+    inputT += 16;
+    in[6] = load_tran_low_to_s16q(inputT);
+    inputT += 16;
+    in[7] = load_tran_low_to_s16q(inputT);
+  } else {
+    const int16_t *inputT = (const int16_t *)input;
+    in[0] = vld1q_s16(inputT);
+    inputT += 16;
+    in[1] = vld1q_s16(inputT);
+    inputT += 16;
+    in[2] = vld1q_s16(inputT);
+    inputT += 16;
+    in[3] = vld1q_s16(inputT);
+    inputT += 16;
+    in[4] = vld1q_s16(inputT);
+    inputT += 16;
+    in[5] = vld1q_s16(inputT);
+    inputT += 16;
+    in[6] = vld1q_s16(inputT);
+    inputT += 16;
+    in[7] = vld1q_s16(inputT);
+  }
+
+  // Transpose
+  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
+
+  // stage 1
+  // Only the even butterfly inputs exist; odd ones are zero and skipped.
+  step1[0] = in[0 / 2];
+  step1[2] = in[8 / 2];
+  step1[4] = in[4 / 2];
+  step1[6] = in[12 / 2];
+  step1[8] = in[2 / 2];
+  step1[10] = in[10 / 2];
+  step1[12] = in[6 / 2];
+  step1[14] = in[14 / 2];  // 0 in pass 1
+
+  // stage 2
+  // With the paired inputs zero, each butterfly collapses to a single
+  // multiply, done with the doubled constants via vqrdmulh.
+  step2[0] = step1[0];
+  step2[2] = step1[2];
+  step2[4] = step1[4];
+  step2[6] = step1[6];
+  step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
+  step2[9] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 3);
+  step2[10] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 3);
+  step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
+  step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
+  step2[13] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 2);
+  step2[14] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 2);
+  step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[2] = step2[2];
+  step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
+  step1[5] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 2);
+  step1[6] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 1);
+  step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
+  step1[8] = vaddq_s16(step2[8], step2[9]);
+  step1[9] = vsubq_s16(step2[8], step2[9]);
+  step1[10] = vsubq_s16(step2[11], step2[10]);
+  step1[11] = vaddq_s16(step2[11], step2[10]);
+  step1[12] = vaddq_s16(step2[12], step2[13]);
+  step1[13] = vsubq_s16(step2[12], step2[13]);
+  step1[14] = vsubq_s16(step2[15], step2[14]);
+  step1[15] = vaddq_s16(step2[15], step2[14]);
+
+  // stage 4
+  step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
+  step2[2] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 3);
+  step2[3] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 1);
+  step2[4] = vaddq_s16(step1[4], step1[5]);
+  step2[5] = vsubq_s16(step1[4], step1[5]);
+  step2[6] = vsubq_s16(step1[7], step1[6]);
+  step2[7] = vaddq_s16(step1[7], step1[6]);
+  step2[8] = step1[8];
+  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+                    &step2[14]);
+  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
+                        &step2[10]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  step1[0] = vaddq_s16(step2[0], step2[3]);
+  step1[1] = vaddq_s16(step2[1], step2[2]);
+  step1[2] = vsubq_s16(step2[1], step2[2]);
+  step1[3] = vsubq_s16(step2[0], step2[3]);
+  step1[4] = step2[4];
+  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
+  step1[7] = step2[7];
+  step1[8] = vaddq_s16(step2[8], step2[11]);
+  step1[9] = vaddq_s16(step2[9], step2[10]);
+  step1[10] = vsubq_s16(step2[9], step2[10]);
+  step1[11] = vsubq_s16(step2[8], step2[11]);
+  step1[12] = vsubq_s16(step2[15], step2[12]);
+  step1[13] = vsubq_s16(step2[14], step2[13]);
+  step1[14] = vaddq_s16(step2[14], step2[13]);
+  step1[15] = vaddq_s16(step2[15], step2[12]);
+
+  // stage 6
+  step2[0] = vaddq_s16(step1[0], step1[7]);
+  step2[1] = vaddq_s16(step1[1], step1[6]);
+  step2[2] = vaddq_s16(step1[2], step1[5]);
+  step2[3] = vaddq_s16(step1[3], step1[4]);
+  step2[4] = vsubq_s16(step1[3], step1[4]);
+  step2[5] = vsubq_s16(step1[2], step1[5]);
+  step2[6] = vsubq_s16(step1[1], step1[6]);
+  step2[7] = vsubq_s16(step1[0], step1[7]);
+  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+                     &step2[13]);
+  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+                     &step2[12]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  idct16x16_add_stage7(step2, out);
+
+  if (output) {
+    idct16x16_store_pass1(out, output);
+  } else {
+    if (highbd_flag) {
+      idct16x16_add_store_bd8(out, dest, stride);
+    } else {
+      idct16x16_add_store(out, dest, stride);
+    }
+  }
+}
+
+// Pass 1 of the 10-coefficient shortcut: only the top-left 4x4 coefficients
+// are assumed non-zero, so the whole pass runs on 4-lane (d-register)
+// vectors. The 16x4 intermediate is written row-by-row to |output|.
+// cospisd* are the doubled cospi constants for vqrdmulh (see the 38-add
+// variant for the scaling rationale).
+void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
+                                       int16_t *output) {
+  const int16x8_t cospis0 = vld1q_s16(kCospi);
+  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
+  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
+  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
+  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
+  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
+  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
+  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
+  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
+  int16x4_t in[4], step1[16], step2[16], out[16];
+
+  // Load input (4x4); stride 16 skips the zero remainder of each row.
+  in[0] = load_tran_low_to_s16d(input);
+  input += 16;
+  in[1] = load_tran_low_to_s16d(input);
+  input += 16;
+  in[2] = load_tran_low_to_s16d(input);
+  input += 16;
+  in[3] = load_tran_low_to_s16d(input);
+
+  // Transpose
+  transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]);
+
+  // stage 1
+  // Only four butterfly inputs survive; all others are zero.
+  step1[0] = in[0 / 2];
+  step1[4] = in[4 / 2];
+  step1[8] = in[2 / 2];
+  step1[12] = in[6 / 2];
+
+  // stage 2
+  // Zero partners collapse each butterfly to a single vqrdmulh multiply.
+  step2[0] = step1[0];
+  step2[4] = step1[4];
+  step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1);
+  step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1);
+  step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0);
+  step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0);
+
+  // stage 3
+  // Add/sub stages with a zero operand reduce to plain copies.
+  step1[0] = step2[0];
+  step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3);
+  step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0);
+  step1[8] = step2[8];
+  step1[9] = step2[8];
+  step1[10] = step2[11];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[13] = step2[12];
+  step1[14] = step2[15];
+  step1[15] = step2[15];
+
+  // stage 4
+  step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2);
+  step2[4] = step1[4];
+  step2[5] = step1[4];
+  step2[6] = step1[7];
+  step2[7] = step1[7];
+  step2[8] = step1[8];
+  idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+                    &step2[14]);
+  idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
+                        &step2[10]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[1];
+  step1[3] = step2[0];
+  step1[4] = step2[4];
+  idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
+  step1[7] = step2[7];
+  step1[8] = vadd_s16(step2[8], step2[11]);
+  step1[9] = vadd_s16(step2[9], step2[10]);
+  step1[10] = vsub_s16(step2[9], step2[10]);
+  step1[11] = vsub_s16(step2[8], step2[11]);
+  step1[12] = vsub_s16(step2[15], step2[12]);
+  step1[13] = vsub_s16(step2[14], step2[13]);
+  step1[14] = vadd_s16(step2[14], step2[13]);
+  step1[15] = vadd_s16(step2[15], step2[12]);
+
+  // stage 6
+  step2[0] = vadd_s16(step1[0], step1[7]);
+  step2[1] = vadd_s16(step1[1], step1[6]);
+  step2[2] = vadd_s16(step1[2], step1[5]);
+  step2[3] = vadd_s16(step1[3], step1[4]);
+  step2[4] = vsub_s16(step1[3], step1[4]);
+  step2[5] = vsub_s16(step1[2], step1[5]);
+  step2[6] = vsub_s16(step1[1], step1[6]);
+  step2[7] = vsub_s16(step1[0], step1[7]);
+  idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+                     &step2[13]);
+  idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+                     &step2[12]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  // Final symmetric add/sub: out[i] = step2[i] +/- step2[15 - i].
+  out[0] = vadd_s16(step2[0], step2[15]);
+  out[1] = vadd_s16(step2[1], step2[14]);
+  out[2] = vadd_s16(step2[2], step2[13]);
+  out[3] = vadd_s16(step2[3], step2[12]);
+  out[4] = vadd_s16(step2[4], step2[11]);
+  out[5] = vadd_s16(step2[5], step2[10]);
+  out[6] = vadd_s16(step2[6], step2[9]);
+  out[7] = vadd_s16(step2[7], step2[8]);
+  out[8] = vsub_s16(step2[7], step2[8]);
+  out[9] = vsub_s16(step2[6], step2[9]);
+  out[10] = vsub_s16(step2[5], step2[10]);
+  out[11] = vsub_s16(step2[4], step2[11]);
+  out[12] = vsub_s16(step2[3], step2[12]);
+  out[13] = vsub_s16(step2[2], step2[13]);
+  out[14] = vsub_s16(step2[1], step2[14]);
+  out[15] = vsub_s16(step2[0], step2[15]);
+
+  // pass 1: save the result into output
+  vst1_s16(output, out[0]);
+  output += 4;
+  vst1_s16(output, out[1]);
+  output += 4;
+  vst1_s16(output, out[2]);
+  output += 4;
+  vst1_s16(output, out[3]);
+  output += 4;
+  vst1_s16(output, out[4]);
+  output += 4;
+  vst1_s16(output, out[5]);
+  output += 4;
+  vst1_s16(output, out[6]);
+  output += 4;
+  vst1_s16(output, out[7]);
+  output += 4;
+  vst1_s16(output, out[8]);
+  output += 4;
+  vst1_s16(output, out[9]);
+  output += 4;
+  vst1_s16(output, out[10]);
+  output += 4;
+  vst1_s16(output, out[11]);
+  output += 4;
+  vst1_s16(output, out[12]);
+  output += 4;
+  vst1_s16(output, out[13]);
+  output += 4;
+  vst1_s16(output, out[14]);
+  output += 4;
+  vst1_s16(output, out[15]);
+}
+
+// Pass 2 of the 10-coefficient shortcut: reads the 4-wide intermediate rows
+// from pass 1, transposes eight of them into four 8-lane vectors and runs
+// the same reduced butterfly network at q-register width. If |output| is
+// NULL (as at the call sites in this file) the result is added to |dest|;
+// otherwise it is stored as another intermediate.
+void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input,
+                                       int16_t *const output, void *const dest,
+                                       const int stride,
+                                       const int highbd_flag) {
+  const int16x8_t cospis0 = vld1q_s16(kCospi);
+  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
+  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
+  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
+  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
+  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
+  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
+  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
+  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
+  int16x4_t ind[8];
+  int16x8_t in[4], step1[16], step2[16], out[16];
+
+  // Load input (4x8)
+  ind[0] = vld1_s16(input);
+  input += 4;
+  ind[1] = vld1_s16(input);
+  input += 4;
+  ind[2] = vld1_s16(input);
+  input += 4;
+  ind[3] = vld1_s16(input);
+  input += 4;
+  ind[4] = vld1_s16(input);
+  input += 4;
+  ind[5] = vld1_s16(input);
+  input += 4;
+  ind[6] = vld1_s16(input);
+  input += 4;
+  ind[7] = vld1_s16(input);
+
+  // Transpose
+  transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6],
+                    ind[7], &in[0], &in[1], &in[2], &in[3]);
+
+  // stage 1
+  step1[0] = in[0 / 2];
+  step1[4] = in[4 / 2];
+  step1[8] = in[2 / 2];
+  step1[12] = in[6 / 2];
+
+  // stage 2
+  // Zero partners collapse each butterfly to a single vqrdmulh multiply
+  // (doubled constants compensate vqrdmulh's (2*a*b)>>16 scaling).
+  step2[0] = step1[0];
+  step2[4] = step1[4];
+  step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
+  step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
+  step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
+  step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
+  step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
+  step1[8] = step2[8];
+  step1[9] = step2[8];
+  step1[10] = step2[11];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[13] = step2[12];
+  step1[14] = step2[15];
+  step1[15] = step2[15];
+
+  // stage 4
+  step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
+  step2[4] = step1[4];
+  step2[5] = step1[4];
+  step2[6] = step1[7];
+  step2[7] = step1[7];
+  step2[8] = step1[8];
+  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+                    &step2[14]);
+  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
+                        &step2[10]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[1];
+  step1[3] = step2[0];
+  step1[4] = step2[4];
+  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
+  step1[7] = step2[7];
+  step1[8] = vaddq_s16(step2[8], step2[11]);
+  step1[9] = vaddq_s16(step2[9], step2[10]);
+  step1[10] = vsubq_s16(step2[9], step2[10]);
+  step1[11] = vsubq_s16(step2[8], step2[11]);
+  step1[12] = vsubq_s16(step2[15], step2[12]);
+  step1[13] = vsubq_s16(step2[14], step2[13]);
+  step1[14] = vaddq_s16(step2[14], step2[13]);
+  step1[15] = vaddq_s16(step2[15], step2[12]);
+
+  // stage 6
+  step2[0] = vaddq_s16(step1[0], step1[7]);
+  step2[1] = vaddq_s16(step1[1], step1[6]);
+  step2[2] = vaddq_s16(step1[2], step1[5]);
+  step2[3] = vaddq_s16(step1[3], step1[4]);
+  step2[4] = vsubq_s16(step1[3], step1[4]);
+  step2[5] = vsubq_s16(step1[2], step1[5]);
+  step2[6] = vsubq_s16(step1[1], step1[6]);
+  step2[7] = vsubq_s16(step1[0], step1[7]);
+  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+                     &step2[13]);
+  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+                     &step2[12]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  idct16x16_add_stage7(step2, out);
+
+  if (output) {
+    idct16x16_store_pass1(out, output);
+  } else {
+    if (highbd_flag) {
+      idct16x16_add_store_bd8(out, dest, stride);
+    } else {
+      idct16x16_add_store(out, dest, stride);
+    }
+  }
+}
+
+// Full 16x16 inverse DCT + add for 8-bit pixels: two row passes (upper and
+// lower 8 rows) into a 16x16 intermediate, then two column passes that add
+// the final residual to |dest|.
+void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
+                                int stride) {
+  int16_t row_idct_output[16 * 16];
+
+  // pass 1
+  // Parallel idct on the upper 8 rows
+  vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 0);
+
+  // Parallel idct on the lower 8 rows
+  vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest,
+                               stride, 0);
+
+  // pass 2
+  // Parallel idct to get the left 8 columns
+  vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 0);
+
+  // Parallel idct to get the right 8 columns
+  vpx_idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride,
+                               0);
+}
+
+// 16x16 inverse DCT + add when at most 38 coefficients (top-left 8x8) are
+// non-zero: one row pass suffices, followed by two column passes.
+void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest,
+                               int stride) {
+  int16_t row_idct_output[16 * 16];
+
+  // pass 1
+  // Parallel idct on the upper 8 rows
+  vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 0);
+
+  // pass 2
+  // Parallel idct to get the left 8 columns
+  vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 0);
+
+  // Parallel idct to get the right 8 columns
+  vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride,
+                              0);
+}
+
+// 16x16 inverse DCT + add when at most 10 coefficients (top-left 4x4) are
+// non-zero: a 4-wide row pass into a 4x16 intermediate, then two 8-wide
+// column passes that add the residual to |dest|.
+void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
+                               int stride) {
+  int16_t row_idct_output[4 * 16];
+
+  // pass 1
+  // Parallel idct on the upper 8 rows
+  vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output);
+
+  // pass 2
+  // Parallel idct to get the left 8 columns
+  vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 0);
+
+  // Parallel idct to get the right 8 columns
+  vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,
+                                    stride, 0);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c
new file mode 100644
index 0000000000..057731ad92
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c
@@ -0,0 +1,674 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+// Loads an 8-wide column slice from 8 consecutive rows of a 32-wide
+// coefficient block (hence the stride of 32 between loads).
+static INLINE void load_8x8_s16(const tran_low_t *input, int16x8_t *const in0,
+                                int16x8_t *const in1, int16x8_t *const in2,
+                                int16x8_t *const in3, int16x8_t *const in4,
+                                int16x8_t *const in5, int16x8_t *const in6,
+                                int16x8_t *const in7) {
+  *in0 = load_tran_low_to_s16q(input);
+  input += 32;
+  *in1 = load_tran_low_to_s16q(input);
+  input += 32;
+  *in2 = load_tran_low_to_s16q(input);
+  input += 32;
+  *in3 = load_tran_low_to_s16q(input);
+  input += 32;
+  *in4 = load_tran_low_to_s16q(input);
+  input += 32;
+  *in5 = load_tran_low_to_s16q(input);
+  input += 32;
+  *in6 = load_tran_low_to_s16q(input);
+  input += 32;
+  *in7 = load_tran_low_to_s16q(input);
+}
+
+// Loads a 4-wide column slice from 8 consecutive rows of a 32-wide
+// coefficient block (stride 32 between loads).
+static INLINE void load_4x8_s16(const tran_low_t *input, int16x4_t *const in0,
+                                int16x4_t *const in1, int16x4_t *const in2,
+                                int16x4_t *const in3, int16x4_t *const in4,
+                                int16x4_t *const in5, int16x4_t *const in6,
+                                int16x4_t *const in7) {
+  *in0 = load_tran_low_to_s16d(input);
+  input += 32;
+  *in1 = load_tran_low_to_s16d(input);
+  input += 32;
+  *in2 = load_tran_low_to_s16d(input);
+  input += 32;
+  *in3 = load_tran_low_to_s16d(input);
+  input += 32;
+  *in4 = load_tran_low_to_s16d(input);
+  input += 32;
+  *in5 = load_tran_low_to_s16d(input);
+  input += 32;
+  *in6 = load_tran_low_to_s16d(input);
+  input += 32;
+  *in7 = load_tran_low_to_s16d(input);
+}
+
+// Only for the first pass of the _135_ variant. Since it only uses values from
+// the top left 16x16 it can safely assume all the remaining values are 0 and
+// skip an awful lot of calculations. In fact, only the first 12 columns make
+// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are
+// used so it skips any calls to input[12|13|14|15] too.
+// In C this does a single row of 32 for each call. Here it transposes the top
+// left 12x8 to allow using SIMD.
+
+// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero
+// coefficients as follows:
+// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// 0 0 2 5 10 17 25 38 47 62 83 101 121
+// 1 1 4 8 15 22 30 45 58 74 92 112 133
+// 2 3 7 12 18 28 36 52 64 82 102 118
+// 3 6 11 16 23 31 43 60 73 90 109 126
+// 4 9 14 19 29 37 50 65 78 98 116 134
+// 5 13 20 26 35 44 54 72 85 105 123
+// 6 21 27 33 42 53 63 80 94 113 132
+// 7 24 32 39 48 57 71 88 104 120
+// 8 34 40 46 56 68 81 96 111 130
+// 9 41 49 55 67 77 91 107 124
+// 10 51 59 66 76 89 99 119 131
+// 11 61 69 75 87 100 114 129
+// 12 70 79 86 97 108 122
+// 13 84 93 103 110 125
+// 14 98 106 115 127
+// NOTE(review): 98 appears in both row 4 and row 14 above while 95 never
+// appears; one of the two 98s is presumably 95 — verify against
+// vp9_default_iscan_32x32 in vp9/common/vp9_scan.c.
+// 15 117 128
+void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output) {
+ int16x4_t tmp[8];
+ int16x8_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32];
+
+ load_8x8_s16(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+ transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+
+ load_4x8_s16(input + 8, &tmp[0], &tmp[1], &tmp[2], &tmp[3], &tmp[4], &tmp[5],
+ &tmp[6], &tmp[7]);
+ transpose_s16_4x8(tmp[0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5], tmp[6],
+ tmp[7], &in[8], &in[9], &in[10], &in[11]);
+
+ // stage 1
+ s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
+ s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);
+
+ s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64);
+ s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64);
+
+ s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64);
+ s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64);
+
+ s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
+ s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);
+
+ s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64);
+ s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64);
+
+ s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
+ s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);
+
+ // stage 2
+ s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
+ s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);
+
+ s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64);
+ s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64);
+
+ s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64);
+ s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64);
+
+ s2[18] = vsubq_s16(s1[19], s1[18]);
+ s2[19] = vaddq_s16(s1[18], s1[19]);
+ s2[20] = vaddq_s16(s1[20], s1[21]);
+ s2[21] = vsubq_s16(s1[20], s1[21]);
+ s2[26] = vsubq_s16(s1[27], s1[26]);
+ s2[27] = vaddq_s16(s1[26], s1[27]);
+ s2[28] = vaddq_s16(s1[28], s1[29]);
+ s2[29] = vsubq_s16(s1[28], s1[29]);
+
+ // stage 3
+ s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
+ s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);
+
+ s3[10] = vsubq_s16(s2[11], s2[10]);
+ s3[11] = vaddq_s16(s2[10], s2[11]);
+ s3[12] = vaddq_s16(s2[12], s2[13]);
+ s3[13] = vsubq_s16(s2[12], s2[13]);
+
+ s3[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31],
+ cospi_28_64);
+ s3[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31],
+ cospi_4_64);
+
+ s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64,
+ s2[29], -cospi_4_64);
+ s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64, s2[29],
+ cospi_28_64);
+
+ s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64,
+ s2[26], cospi_12_64);
+ s3[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64, s2[26],
+ cospi_20_64);
+
+ s3[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
+ s1[24], -cospi_20_64);
+ s3[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
+ s1[24], cospi_12_64);
+
+ // stage 4
+ s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
+ s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64);
+ s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64);
+
+ s4[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
+ cospi_24_64);
+ s4[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
+ cospi_8_64);
+
+ s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64,
+ s3[13], -cospi_8_64);
+ s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64, s3[13],
+ cospi_24_64);
+
+ s4[16] = vaddq_s16(s1[16], s2[19]);
+ s4[17] = vaddq_s16(s3[17], s3[18]);
+ s4[18] = vsubq_s16(s3[17], s3[18]);
+ s4[19] = vsubq_s16(s1[16], s2[19]);
+ s4[20] = vsubq_s16(s1[23], s2[20]);
+ s4[21] = vsubq_s16(s3[22], s3[21]);
+ s4[22] = vaddq_s16(s3[21], s3[22]);
+ s4[23] = vaddq_s16(s2[20], s1[23]);
+ s4[24] = vaddq_s16(s1[24], s2[27]);
+ s4[25] = vaddq_s16(s3[25], s3[26]);
+ s4[26] = vsubq_s16(s3[25], s3[26]);
+ s4[27] = vsubq_s16(s1[24], s2[27]);
+ s4[28] = vsubq_s16(s1[31], s2[28]);
+ s4[29] = vsubq_s16(s3[30], s3[29]);
+ s4[30] = vaddq_s16(s3[29], s3[30]);
+ s4[31] = vaddq_s16(s2[28], s1[31]);
+
+ // stage 5
+ s5[0] = vaddq_s16(s4[0], s4[3]);
+ s5[1] = vaddq_s16(s4[0], s4[2]);
+ s5[2] = vsubq_s16(s4[0], s4[2]);
+ s5[3] = vsubq_s16(s4[0], s4[3]);
+
+ s5[5] = sub_multiply_shift_and_narrow_s16(s3[7], s3[4], cospi_16_64);
+ s5[6] = add_multiply_shift_and_narrow_s16(s3[4], s3[7], cospi_16_64);
+
+ s5[8] = vaddq_s16(s2[8], s3[11]);
+ s5[9] = vaddq_s16(s4[9], s4[10]);
+ s5[10] = vsubq_s16(s4[9], s4[10]);
+ s5[11] = vsubq_s16(s2[8], s3[11]);
+ s5[12] = vsubq_s16(s2[15], s3[12]);
+ s5[13] = vsubq_s16(s4[14], s4[13]);
+ s5[14] = vaddq_s16(s4[13], s4[14]);
+ s5[15] = vaddq_s16(s2[15], s3[12]);
+
+ s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64, s4[29],
+ cospi_24_64);
+ s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64, s4[29],
+ cospi_8_64);
+
+ s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64, s4[28],
+ cospi_24_64);
+ s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64, s4[28],
+ cospi_8_64);
+
+ s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64,
+ s4[27], -cospi_8_64);
+ s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64, s4[27],
+ cospi_24_64);
+
+ s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64,
+ s4[26], -cospi_8_64);
+ s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64, s4[26],
+ cospi_24_64);
+
+ // stage 6
+ s6[0] = vaddq_s16(s5[0], s3[7]);
+ s6[1] = vaddq_s16(s5[1], s5[6]);
+ s6[2] = vaddq_s16(s5[2], s5[5]);
+ s6[3] = vaddq_s16(s5[3], s3[4]);
+ s6[4] = vsubq_s16(s5[3], s3[4]);
+ s6[5] = vsubq_s16(s5[2], s5[5]);
+ s6[6] = vsubq_s16(s5[1], s5[6]);
+ s6[7] = vsubq_s16(s5[0], s3[7]);
+
+ s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64);
+ s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64);
+
+ s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64);
+ s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64);
+
+ s6[16] = vaddq_s16(s4[16], s4[23]);
+ s6[17] = vaddq_s16(s4[17], s4[22]);
+ s6[18] = vaddq_s16(s5[18], s5[21]);
+ s6[19] = vaddq_s16(s5[19], s5[20]);
+ s6[20] = vsubq_s16(s5[19], s5[20]);
+ s6[21] = vsubq_s16(s5[18], s5[21]);
+ s6[22] = vsubq_s16(s4[17], s4[22]);
+ s6[23] = vsubq_s16(s4[16], s4[23]);
+
+ s6[24] = vsubq_s16(s4[31], s4[24]);
+ s6[25] = vsubq_s16(s4[30], s4[25]);
+ s6[26] = vsubq_s16(s5[29], s5[26]);
+ s6[27] = vsubq_s16(s5[28], s5[27]);
+ s6[28] = vaddq_s16(s5[27], s5[28]);
+ s6[29] = vaddq_s16(s5[26], s5[29]);
+ s6[30] = vaddq_s16(s4[25], s4[30]);
+ s6[31] = vaddq_s16(s4[24], s4[31]);
+
+ // stage 7
+ s7[0] = vaddq_s16(s6[0], s5[15]);
+ s7[1] = vaddq_s16(s6[1], s5[14]);
+ s7[2] = vaddq_s16(s6[2], s6[13]);
+ s7[3] = vaddq_s16(s6[3], s6[12]);
+ s7[4] = vaddq_s16(s6[4], s6[11]);
+ s7[5] = vaddq_s16(s6[5], s6[10]);
+ s7[6] = vaddq_s16(s6[6], s5[9]);
+ s7[7] = vaddq_s16(s6[7], s5[8]);
+ s7[8] = vsubq_s16(s6[7], s5[8]);
+ s7[9] = vsubq_s16(s6[6], s5[9]);
+ s7[10] = vsubq_s16(s6[5], s6[10]);
+ s7[11] = vsubq_s16(s6[4], s6[11]);
+ s7[12] = vsubq_s16(s6[3], s6[12]);
+ s7[13] = vsubq_s16(s6[2], s6[13]);
+ s7[14] = vsubq_s16(s6[1], s5[14]);
+ s7[15] = vsubq_s16(s6[0], s5[15]);
+
+ s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64);
+ s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64);
+
+ s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64);
+ s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64);
+
+ s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64);
+ s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64);
+
+ s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64);
+ s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64);
+
+ // final stage
+ vst1q_s16(output, vaddq_s16(s7[0], s6[31]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[1], s6[30]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[2], s6[29]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[3], s6[28]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[4], s7[27]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[5], s7[26]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[6], s7[25]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[7], s7[24]));
+ output += 16;
+
+ vst1q_s16(output, vaddq_s16(s7[8], s7[23]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[9], s7[22]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[10], s7[21]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[11], s7[20]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[12], s6[19]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[13], s6[18]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[14], s6[17]));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7[15], s6[16]));
+ output += 16;
+
+ vst1q_s16(output, vsubq_s16(s7[15], s6[16]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[14], s6[17]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[13], s6[18]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[12], s6[19]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[11], s7[20]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[10], s7[21]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[9], s7[22]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[8], s7[23]));
+ output += 16;
+
+ vst1q_s16(output, vsubq_s16(s7[7], s7[24]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[6], s7[25]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[5], s7[26]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[4], s7[27]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[3], s6[28]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[2], s6[29]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[1], s6[30]));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7[0], s6[31]));
+}
+
+// Second pass of the 32x32 idct for the _135_ variant. Transforms one band of
+// 8 columns taken from the 32x16 intermediate buffer (row stride 16) and
+// accumulates the result into the destination. When highbd_flag is non-zero
+// the destination is written through highbd_add_and_store_bd8; otherwise it is
+// treated as 8-bit pixels and written in four 8-row groups.
+void vpx_idct32_16_neon(const int16_t *const input, void *const output,
+ const int stride, const int highbd_flag) {
+ int16x8_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
+ out[32];
+
+ // Load and transpose the 16x8 input tile as two 8x8 halves (stride 16).
+ load_and_transpose_s16_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4],
+ &in[5], &in[6], &in[7]);
+
+ load_and_transpose_s16_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11],
+ &in[12], &in[13], &in[14], &in[15]);
+
+ // stage 1
+ // Odd-coefficient rotations: each pair s1[k]/s1[31-k] is one input row
+ // scaled by a cospi pair.
+ s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
+ s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);
+
+ s1[17] = multiply_shift_and_narrow_s16(in[15], -cospi_17_64);
+ s1[30] = multiply_shift_and_narrow_s16(in[15], cospi_15_64);
+
+ s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64);
+ s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64);
+
+ s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64);
+ s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64);
+
+ s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
+ s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);
+
+ s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64);
+ s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64);
+
+ s1[22] = multiply_shift_and_narrow_s16(in[13], cospi_19_64);
+ s1[25] = multiply_shift_and_narrow_s16(in[13], cospi_13_64);
+
+ s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
+ s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);
+
+ // stage 2
+ s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
+ s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);
+
+ s2[9] = multiply_shift_and_narrow_s16(in[14], -cospi_18_64);
+ s2[14] = multiply_shift_and_narrow_s16(in[14], cospi_14_64);
+
+ s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64);
+ s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64);
+
+ s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64);
+ s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64);
+
+ // Butterfly the stage-1 odd outputs in adjacent pairs.
+ s2[16] = vaddq_s16(s1[16], s1[17]);
+ s2[17] = vsubq_s16(s1[16], s1[17]);
+ s2[18] = vsubq_s16(s1[19], s1[18]);
+ s2[19] = vaddq_s16(s1[18], s1[19]);
+ s2[20] = vaddq_s16(s1[20], s1[21]);
+ s2[21] = vsubq_s16(s1[20], s1[21]);
+ s2[22] = vsubq_s16(s1[23], s1[22]);
+ s2[23] = vaddq_s16(s1[22], s1[23]);
+ s2[24] = vaddq_s16(s1[24], s1[25]);
+ s2[25] = vsubq_s16(s1[24], s1[25]);
+ s2[26] = vsubq_s16(s1[27], s1[26]);
+ s2[27] = vaddq_s16(s1[26], s1[27]);
+ s2[28] = vaddq_s16(s1[28], s1[29]);
+ s2[29] = vsubq_s16(s1[28], s1[29]);
+ s2[30] = vsubq_s16(s1[31], s1[30]);
+ s2[31] = vaddq_s16(s1[30], s1[31]);
+
+ // stage 3
+ s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
+ s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);
+
+ s3[5] = multiply_shift_and_narrow_s16(in[12], -cospi_20_64);
+ s3[6] = multiply_shift_and_narrow_s16(in[12], cospi_12_64);
+
+ s3[8] = vaddq_s16(s2[8], s2[9]);
+ s3[9] = vsubq_s16(s2[8], s2[9]);
+ s3[10] = vsubq_s16(s2[11], s2[10]);
+ s3[11] = vaddq_s16(s2[10], s2[11]);
+ s3[12] = vaddq_s16(s2[12], s2[13]);
+ s3[13] = vsubq_s16(s2[12], s2[13]);
+ s3[14] = vsubq_s16(s2[15], s2[14]);
+ s3[15] = vaddq_s16(s2[14], s2[15]);
+
+ // Paired rotations: each call computes a*c0 + b*c1, round-shifted back to
+ // 16 bits.
+ s3[17] = multiply_accumulate_shift_and_narrow_s16(s2[17], -cospi_4_64, s2[30],
+ cospi_28_64);
+ s3[30] = multiply_accumulate_shift_and_narrow_s16(s2[17], cospi_28_64, s2[30],
+ cospi_4_64);
+
+ s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64,
+ s2[29], -cospi_4_64);
+ s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64, s2[29],
+ cospi_28_64);
+
+ s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64,
+ s2[26], cospi_12_64);
+ s3[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64, s2[26],
+ cospi_20_64);
+
+ s3[22] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_12_64,
+ s2[25], -cospi_20_64);
+ s3[25] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_20_64,
+ s2[25], cospi_12_64);
+
+ // stage 4
+ s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
+ s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64);
+ s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64);
+
+ s4[4] = vaddq_s16(s3[4], s3[5]);
+ s4[5] = vsubq_s16(s3[4], s3[5]);
+ s4[6] = vsubq_s16(s3[7], s3[6]);
+ s4[7] = vaddq_s16(s3[6], s3[7]);
+
+ s4[9] = multiply_accumulate_shift_and_narrow_s16(s3[9], -cospi_8_64, s3[14],
+ cospi_24_64);
+ s4[14] = multiply_accumulate_shift_and_narrow_s16(s3[9], cospi_24_64, s3[14],
+ cospi_8_64);
+
+ s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64,
+ s3[13], -cospi_8_64);
+ s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64, s3[13],
+ cospi_24_64);
+
+ s4[16] = vaddq_s16(s2[16], s2[19]);
+ s4[17] = vaddq_s16(s3[17], s3[18]);
+ s4[18] = vsubq_s16(s3[17], s3[18]);
+ s4[19] = vsubq_s16(s2[16], s2[19]);
+ s4[20] = vsubq_s16(s2[23], s2[20]);
+ s4[21] = vsubq_s16(s3[22], s3[21]);
+ s4[22] = vaddq_s16(s3[21], s3[22]);
+ s4[23] = vaddq_s16(s2[20], s2[23]);
+ s4[24] = vaddq_s16(s2[24], s2[27]);
+ s4[25] = vaddq_s16(s3[25], s3[26]);
+ s4[26] = vsubq_s16(s3[25], s3[26]);
+ s4[27] = vsubq_s16(s2[24], s2[27]);
+ s4[28] = vsubq_s16(s2[31], s2[28]);
+ s4[29] = vsubq_s16(s3[30], s3[29]);
+ s4[30] = vaddq_s16(s3[29], s3[30]);
+ s4[31] = vaddq_s16(s2[28], s2[31]);
+
+ // stage 5
+ s5[0] = vaddq_s16(s4[0], s4[3]);
+ s5[1] = vaddq_s16(s4[0], s4[2]);
+ s5[2] = vsubq_s16(s4[0], s4[2]);
+ s5[3] = vsubq_s16(s4[0], s4[3]);
+
+ // (b - a) * cospi_16_64 and (a + b) * cospi_16_64 respectively.
+ s5[5] = sub_multiply_shift_and_narrow_s16(s4[6], s4[5], cospi_16_64);
+ s5[6] = add_multiply_shift_and_narrow_s16(s4[5], s4[6], cospi_16_64);
+
+ s5[8] = vaddq_s16(s3[8], s3[11]);
+ s5[9] = vaddq_s16(s4[9], s4[10]);
+ s5[10] = vsubq_s16(s4[9], s4[10]);
+ s5[11] = vsubq_s16(s3[8], s3[11]);
+ s5[12] = vsubq_s16(s3[15], s3[12]);
+ s5[13] = vsubq_s16(s4[14], s4[13]);
+ s5[14] = vaddq_s16(s4[13], s4[14]);
+ s5[15] = vaddq_s16(s3[15], s3[12]);
+
+ s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64, s4[29],
+ cospi_24_64);
+ s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64, s4[29],
+ cospi_8_64);
+
+ s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64, s4[28],
+ cospi_24_64);
+ s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64, s4[28],
+ cospi_8_64);
+
+ s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64,
+ s4[27], -cospi_8_64);
+ s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64, s4[27],
+ cospi_24_64);
+
+ s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64,
+ s4[26], -cospi_8_64);
+ s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64, s4[26],
+ cospi_24_64);
+
+ // stage 6
+ s6[0] = vaddq_s16(s5[0], s4[7]);
+ s6[1] = vaddq_s16(s5[1], s5[6]);
+ s6[2] = vaddq_s16(s5[2], s5[5]);
+ s6[3] = vaddq_s16(s5[3], s4[4]);
+ s6[4] = vsubq_s16(s5[3], s4[4]);
+ s6[5] = vsubq_s16(s5[2], s5[5]);
+ s6[6] = vsubq_s16(s5[1], s5[6]);
+ s6[7] = vsubq_s16(s5[0], s4[7]);
+
+ s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64);
+ s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64);
+
+ s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64);
+ s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64);
+
+ s6[16] = vaddq_s16(s4[16], s4[23]);
+ s6[17] = vaddq_s16(s4[17], s4[22]);
+ s6[18] = vaddq_s16(s5[18], s5[21]);
+ s6[19] = vaddq_s16(s5[19], s5[20]);
+ s6[20] = vsubq_s16(s5[19], s5[20]);
+ s6[21] = vsubq_s16(s5[18], s5[21]);
+ s6[22] = vsubq_s16(s4[17], s4[22]);
+ s6[23] = vsubq_s16(s4[16], s4[23]);
+ s6[24] = vsubq_s16(s4[31], s4[24]);
+ s6[25] = vsubq_s16(s4[30], s4[25]);
+ s6[26] = vsubq_s16(s5[29], s5[26]);
+ s6[27] = vsubq_s16(s5[28], s5[27]);
+ s6[28] = vaddq_s16(s5[27], s5[28]);
+ s6[29] = vaddq_s16(s5[26], s5[29]);
+ s6[30] = vaddq_s16(s4[25], s4[30]);
+ s6[31] = vaddq_s16(s4[24], s4[31]);
+
+ // stage 7
+ s7[0] = vaddq_s16(s6[0], s5[15]);
+ s7[1] = vaddq_s16(s6[1], s5[14]);
+ s7[2] = vaddq_s16(s6[2], s6[13]);
+ s7[3] = vaddq_s16(s6[3], s6[12]);
+ s7[4] = vaddq_s16(s6[4], s6[11]);
+ s7[5] = vaddq_s16(s6[5], s6[10]);
+ s7[6] = vaddq_s16(s6[6], s5[9]);
+ s7[7] = vaddq_s16(s6[7], s5[8]);
+ s7[8] = vsubq_s16(s6[7], s5[8]);
+ s7[9] = vsubq_s16(s6[6], s5[9]);
+ s7[10] = vsubq_s16(s6[5], s6[10]);
+ s7[11] = vsubq_s16(s6[4], s6[11]);
+ s7[12] = vsubq_s16(s6[3], s6[12]);
+ s7[13] = vsubq_s16(s6[2], s6[13]);
+ s7[14] = vsubq_s16(s6[1], s5[14]);
+ s7[15] = vsubq_s16(s6[0], s5[15]);
+
+ s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64);
+ s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64);
+
+ s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64);
+ s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64);
+
+ s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64);
+ s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64);
+
+ s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64);
+ s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64);
+
+ // final stage
+ // Mirror-symmetric combine: out[i] and out[31-i] use the same operand pair
+ // with add/sub swapped. NOTE(review): final_add/final_sub presumably clamp
+ // for the high bitdepth path — confirm in highbd_idct_neon.h.
+ out[0] = final_add(s7[0], s6[31]);
+ out[1] = final_add(s7[1], s6[30]);
+ out[2] = final_add(s7[2], s6[29]);
+ out[3] = final_add(s7[3], s6[28]);
+ out[4] = final_add(s7[4], s7[27]);
+ out[5] = final_add(s7[5], s7[26]);
+ out[6] = final_add(s7[6], s7[25]);
+ out[7] = final_add(s7[7], s7[24]);
+ out[8] = final_add(s7[8], s7[23]);
+ out[9] = final_add(s7[9], s7[22]);
+ out[10] = final_add(s7[10], s7[21]);
+ out[11] = final_add(s7[11], s7[20]);
+ out[12] = final_add(s7[12], s6[19]);
+ out[13] = final_add(s7[13], s6[18]);
+ out[14] = final_add(s7[14], s6[17]);
+ out[15] = final_add(s7[15], s6[16]);
+ out[16] = final_sub(s7[15], s6[16]);
+ out[17] = final_sub(s7[14], s6[17]);
+ out[18] = final_sub(s7[13], s6[18]);
+ out[19] = final_sub(s7[12], s6[19]);
+ out[20] = final_sub(s7[11], s7[20]);
+ out[21] = final_sub(s7[10], s7[21]);
+ out[22] = final_sub(s7[9], s7[22]);
+ out[23] = final_sub(s7[8], s7[23]);
+ out[24] = final_sub(s7[7], s7[24]);
+ out[25] = final_sub(s7[6], s7[25]);
+ out[26] = final_sub(s7[5], s7[26]);
+ out[27] = final_sub(s7[4], s7[27]);
+ out[28] = final_sub(s7[3], s6[28]);
+ out[29] = final_sub(s7[2], s6[29]);
+ out[30] = final_sub(s7[1], s6[30]);
+ out[31] = final_sub(s7[0], s6[31]);
+
+ if (highbd_flag) {
+ highbd_add_and_store_bd8(out, output, stride);
+ } else {
+ uint8_t *const outputT = (uint8_t *)output;
+ add_and_store_u8_s16(out + 0, outputT, stride);
+ add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride);
+ add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride);
+ add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride);
+ }
+}
+
+// 32x32 inverse DCT + add for the _135_ variant. Assumes all non-zero
+// coefficients fall in the rows covered by the two first-pass calls (rows
+// 0-7 and 8-15; the remaining rows are presumably zero for this eob band —
+// confirm against the caller's eob threshold).
+// Pass 1: vpx_idct32_12_neon writes a 32x16 intermediate into |temp| with a
+// row stride of 16. Pass 2: vpx_idct32_16_neon consumes 8 columns per
+// iteration (16*8 int16 each) and accumulates into |dest| (lowbd path,
+// highbd_flag == 0).
+void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i;
+ int16_t temp[32 * 16];
+ int16_t *t = temp;
+
+ vpx_idct32_12_neon(input, temp);
+ vpx_idct32_12_neon(input + 32 * 8, temp + 8);
+
+ for (i = 0; i < 32; i += 8) {
+ vpx_idct32_16_neon(t, dest, stride, 0);
+ t += (16 * 8);
+ dest += 8;
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
new file mode 100644
index 0000000000..8920b93363
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+// Add a non-negative DC value |res| (replicated across all 16 lanes) to one
+// 32-pixel row of the destination, with unsigned saturation (vqaddq_u8 clamps
+// at 255). Advances *dest to the next row.
+static INLINE void idct32x32_1_add_pos_kernel(uint8_t **dest, const int stride,
+ const uint8x16_t res) {
+ const uint8x16_t a0 = vld1q_u8(*dest);
+ const uint8x16_t a1 = vld1q_u8(*dest + 16);
+ const uint8x16_t b0 = vqaddq_u8(a0, res);
+ const uint8x16_t b1 = vqaddq_u8(a1, res);
+ vst1q_u8(*dest, b0);
+ vst1q_u8(*dest + 16, b1);
+ *dest += stride;
+}
+
+// Subtract the magnitude of a negative DC value |res| from one 32-pixel row
+// of the destination, with unsigned saturation (vqsubq_u8 clamps at 0).
+// Advances *dest to the next row.
+static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride,
+ const uint8x16_t res) {
+ const uint8x16_t a0 = vld1q_u8(*dest);
+ const uint8x16_t a1 = vld1q_u8(*dest + 16);
+ const uint8x16_t b0 = vqsubq_u8(a0, res);
+ const uint8x16_t b1 = vqsubq_u8(a1, res);
+ vst1q_u8(*dest, b0);
+ vst1q_u8(*dest + 16, b1);
+ *dest += stride;
+}
+
+// DC-only 32x32 inverse DCT + add: only input[0] is used, so the inverse
+// transform of the block is a single constant a1 added to every pixel.
+// a1 = round((input[0] * cospi_16_64 scaled twice) >> 6), matching the two
+// dct_const_round_shift stages of the full transform. The sign split exists
+// because the NEON u8 saturating ops are unsigned: a positive DC uses
+// vqaddq_u8, a negative DC uses vqsubq_u8 with the negated value.
+void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i;
+ const int16_t out0 =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+ const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+
+ if (a1 >= 0) {
+ const uint8x16_t dc = create_dcq(a1);
+ for (i = 0; i < 32; i++) {
+ idct32x32_1_add_pos_kernel(&dest, stride, dc);
+ }
+ } else {
+ const uint8x16_t dc = create_dcq(-a1);
+ for (i = 0; i < 32; i++) {
+ idct32x32_1_add_neg_kernel(&dest, stride, dc);
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c
new file mode 100644
index 0000000000..f570547e44
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c
@@ -0,0 +1,513 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+// Only for the first pass of the _34_ variant. Since it only uses values from
+// the top left 8x8 it can safely assume all the remaining values are 0 and skip
+// an awful lot of calculations. In fact, only the first 6 columns make the cut.
+// None of the elements in the 7th or 8th column are used so it skips any calls
+// to input[67] too.
+// In C this does a single row of 32 for each call. Here it transposes the top
+// left 8x8 to allow using SIMD.
+
+// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero
+// coefficients as follows:
+// 0 1 2 3 4 5 6 7
+// 0 0 2 5 10 17 25
+// 1 1 4 8 15 22 30
+// 2 3 7 12 18 28
+// 3 6 11 16 23 31
+// 4 9 14 19 29
+// 5 13 20 26
+// 6 21 27 33
+// 7 24 32
+// First pass of the _34_ variant (see the block comment above): reads the
+// top-left 8x8 of coefficients (only columns 0-5 are non-zero per the iscan
+// table), transposes, runs the reduced 32-point butterfly, and writes a 32x8
+// int16 intermediate to |output| (8 values per row).
+void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output) {
+ int16x8_t in[8], s1[32], s2[32], s3[32];
+
+ // Load 8 rows (input row stride is 32 coefficients).
+ in[0] = load_tran_low_to_s16q(input);
+ input += 32;
+ in[1] = load_tran_low_to_s16q(input);
+ input += 32;
+ in[2] = load_tran_low_to_s16q(input);
+ input += 32;
+ in[3] = load_tran_low_to_s16q(input);
+ input += 32;
+ in[4] = load_tran_low_to_s16q(input);
+ input += 32;
+ in[5] = load_tran_low_to_s16q(input);
+ input += 32;
+ in[6] = load_tran_low_to_s16q(input);
+ input += 32;
+ in[7] = load_tran_low_to_s16q(input);
+ transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+
+ // stage 1
+ // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
+ s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
+ // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0)
+ s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);
+
+ s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
+ s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);
+
+ s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
+ s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);
+
+ // stage 2
+ s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
+ s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);
+
+ // stage 3
+ s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
+ s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);
+
+ s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31],
+ cospi_28_64);
+ s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31],
+ cospi_4_64);
+
+ s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64,
+ s1[27], cospi_12_64);
+ s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27],
+ cospi_20_64);
+
+ s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
+ s1[24], -cospi_20_64);
+ s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
+ s1[24], cospi_12_64);
+
+ // stage 4
+ s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
+
+ s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
+ cospi_24_64);
+ s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
+ cospi_8_64);
+
+ s2[20] = vsubq_s16(s1[23], s1[20]);
+ s2[21] = vsubq_s16(s1[22], s1[21]);
+ s2[22] = vaddq_s16(s1[21], s1[22]);
+ s2[23] = vaddq_s16(s1[20], s1[23]);
+ s2[24] = vaddq_s16(s1[24], s1[27]);
+ s2[25] = vaddq_s16(s1[25], s1[26]);
+ s2[26] = vsubq_s16(s1[25], s1[26]);
+ s2[27] = vsubq_s16(s1[24], s1[27]);
+
+ // stage 5
+ s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64);
+ s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64);
+
+ s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[17], -cospi_8_64, s1[30],
+ cospi_24_64);
+ s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[17], cospi_24_64, s1[30],
+ cospi_8_64);
+
+ s1[19] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_8_64, s1[31],
+ cospi_24_64);
+ s1[28] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_24_64, s1[31],
+ cospi_8_64);
+
+ s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64,
+ s2[27], -cospi_8_64);
+ s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27],
+ cospi_24_64);
+
+ s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64,
+ s2[26], -cospi_8_64);
+ s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26],
+ cospi_24_64);
+
+ // stage 6
+ s2[0] = vaddq_s16(s1[0], s1[7]);
+ s2[1] = vaddq_s16(s1[0], s1[6]);
+ s2[2] = vaddq_s16(s1[0], s1[5]);
+ s2[3] = vaddq_s16(s1[0], s1[4]);
+ s2[4] = vsubq_s16(s1[0], s1[4]);
+ s2[5] = vsubq_s16(s1[0], s1[5]);
+ s2[6] = vsubq_s16(s1[0], s1[6]);
+ s2[7] = vsubq_s16(s1[0], s1[7]);
+
+ s2[10] = sub_multiply_shift_and_narrow_s16(s2[14], s2[9], cospi_16_64);
+ s2[13] = add_multiply_shift_and_narrow_s16(s2[9], s2[14], cospi_16_64);
+
+ s2[11] = sub_multiply_shift_and_narrow_s16(s2[15], s2[8], cospi_16_64);
+ s2[12] = add_multiply_shift_and_narrow_s16(s2[8], s2[15], cospi_16_64);
+
+ s2[16] = vaddq_s16(s1[16], s2[23]);
+ s2[17] = vaddq_s16(s1[17], s2[22]);
+ s2[18] = vaddq_s16(s1[18], s1[21]);
+ s2[19] = vaddq_s16(s1[19], s1[20]);
+ s2[20] = vsubq_s16(s1[19], s1[20]);
+ s2[21] = vsubq_s16(s1[18], s1[21]);
+ s2[22] = vsubq_s16(s1[17], s2[22]);
+ s2[23] = vsubq_s16(s1[16], s2[23]);
+
+ // s3 holds [24..27] here because s2[24] and s2[25] are still read just
+ // below when forming s2[30]/s2[31].
+ s3[24] = vsubq_s16(s1[31], s2[24]);
+ s3[25] = vsubq_s16(s1[30], s2[25]);
+ s3[26] = vsubq_s16(s1[29], s1[26]);
+ s3[27] = vsubq_s16(s1[28], s1[27]);
+ s2[28] = vaddq_s16(s1[27], s1[28]);
+ s2[29] = vaddq_s16(s1[26], s1[29]);
+ s2[30] = vaddq_s16(s2[25], s1[30]);
+ s2[31] = vaddq_s16(s2[24], s1[31]);
+
+ // stage 7
+ s1[0] = vaddq_s16(s2[0], s2[15]);
+ s1[1] = vaddq_s16(s2[1], s2[14]);
+ s1[2] = vaddq_s16(s2[2], s2[13]);
+ s1[3] = vaddq_s16(s2[3], s2[12]);
+ s1[4] = vaddq_s16(s2[4], s2[11]);
+ s1[5] = vaddq_s16(s2[5], s2[10]);
+ s1[6] = vaddq_s16(s2[6], s2[9]);
+ s1[7] = vaddq_s16(s2[7], s2[8]);
+ s1[8] = vsubq_s16(s2[7], s2[8]);
+ s1[9] = vsubq_s16(s2[6], s2[9]);
+ s1[10] = vsubq_s16(s2[5], s2[10]);
+ s1[11] = vsubq_s16(s2[4], s2[11]);
+ s1[12] = vsubq_s16(s2[3], s2[12]);
+ s1[13] = vsubq_s16(s2[2], s2[13]);
+ s1[14] = vsubq_s16(s2[1], s2[14]);
+ s1[15] = vsubq_s16(s2[0], s2[15]);
+
+ s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64);
+ s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64);
+
+ s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64);
+ s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64);
+
+ s1[22] = sub_multiply_shift_and_narrow_s16(s3[25], s2[22], cospi_16_64);
+ s1[25] = add_multiply_shift_and_narrow_s16(s2[22], s3[25], cospi_16_64);
+
+ s1[23] = sub_multiply_shift_and_narrow_s16(s3[24], s2[23], cospi_16_64);
+ s1[24] = add_multiply_shift_and_narrow_s16(s2[23], s3[24], cospi_16_64);
+
+ // final stage
+ // Write 32 rows of 8; row i and row 31-i share operands with add/sub
+ // swapped.
+ vst1q_s16(output, vaddq_s16(s1[0], s2[31]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[1], s2[30]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[2], s2[29]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[3], s2[28]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[4], s1[27]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[5], s1[26]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[6], s1[25]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[7], s1[24]));
+ output += 8;
+
+ vst1q_s16(output, vaddq_s16(s1[8], s1[23]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[9], s1[22]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[10], s1[21]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[11], s1[20]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[12], s2[19]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[13], s2[18]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[14], s2[17]));
+ output += 8;
+ vst1q_s16(output, vaddq_s16(s1[15], s2[16]));
+ output += 8;
+
+ vst1q_s16(output, vsubq_s16(s1[15], s2[16]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[14], s2[17]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[13], s2[18]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[12], s2[19]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[11], s1[20]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[10], s1[21]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[9], s1[22]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[8], s1[23]));
+ output += 8;
+
+ vst1q_s16(output, vsubq_s16(s1[7], s1[24]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[6], s1[25]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[5], s1[26]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[4], s1[27]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[3], s2[28]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[2], s2[29]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[1], s2[30]));
+ output += 8;
+ vst1q_s16(output, vsubq_s16(s1[0], s2[31]));
+}
+
+void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride,
+ const int highbd_flag) {
+ int16x8_t in[8], s1[32], s2[32], s3[32], out[32];
+
+ load_and_transpose_s16_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4],
+ &in[5], &in[6], &in[7]);
+
+ // stage 1
+ s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
+ s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);
+
+ // Different for _8_
+ s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64);
+ s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64);
+
+ s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
+ s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);
+
+ s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
+ s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);
+
+ // stage 2
+ s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
+ s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);
+
+ s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64);
+ s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64);
+
+ // stage 3
+ s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
+ s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);
+
+ s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31],
+ cospi_28_64);
+ s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31],
+ cospi_4_64);
+
+ // Different for _8_
+ s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_28_64,
+ s1[28], -cospi_4_64);
+ s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_4_64, s1[28],
+ cospi_28_64);
+
+ s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64,
+ s1[27], cospi_12_64);
+ s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27],
+ cospi_20_64);
+
+ s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
+ s1[24], -cospi_20_64);
+ s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
+ s1[24], cospi_12_64);
+
+ // stage 4
+ s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
+
+ s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
+ cospi_24_64);
+ s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
+ cospi_8_64);
+
+ s2[10] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_24_64,
+ s2[12], -cospi_8_64);
+ s2[13] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_8_64, s2[12],
+ cospi_24_64);
+
+ s2[16] = vaddq_s16(s1[16], s1[19]);
+
+ s2[17] = vaddq_s16(s1[17], s1[18]);
+ s2[18] = vsubq_s16(s1[17], s1[18]);
+
+ s2[19] = vsubq_s16(s1[16], s1[19]);
+
+ s2[20] = vsubq_s16(s1[23], s1[20]);
+ s2[21] = vsubq_s16(s1[22], s1[21]);
+
+ s2[22] = vaddq_s16(s1[21], s1[22]);
+ s2[23] = vaddq_s16(s1[20], s1[23]);
+
+ s2[24] = vaddq_s16(s1[24], s1[27]);
+ s2[25] = vaddq_s16(s1[25], s1[26]);
+ s2[26] = vsubq_s16(s1[25], s1[26]);
+ s2[27] = vsubq_s16(s1[24], s1[27]);
+
+ s2[28] = vsubq_s16(s1[31], s1[28]);
+ s2[29] = vsubq_s16(s1[30], s1[29]);
+ s2[30] = vaddq_s16(s1[29], s1[30]);
+ s2[31] = vaddq_s16(s1[28], s1[31]);
+
+ // stage 5
+ s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64);
+ s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64);
+
+ s1[8] = vaddq_s16(s2[8], s2[11]);
+ s1[9] = vaddq_s16(s2[9], s2[10]);
+ s1[10] = vsubq_s16(s2[9], s2[10]);
+ s1[11] = vsubq_s16(s2[8], s2[11]);
+ s1[12] = vsubq_s16(s2[15], s2[12]);
+ s1[13] = vsubq_s16(s2[14], s2[13]);
+ s1[14] = vaddq_s16(s2[13], s2[14]);
+ s1[15] = vaddq_s16(s2[12], s2[15]);
+
+ s1[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_8_64, s2[29],
+ cospi_24_64);
+ s1[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], cospi_24_64, s2[29],
+ cospi_8_64);
+
+ s1[19] = multiply_accumulate_shift_and_narrow_s16(s2[19], -cospi_8_64, s2[28],
+ cospi_24_64);
+ s1[28] = multiply_accumulate_shift_and_narrow_s16(s2[19], cospi_24_64, s2[28],
+ cospi_8_64);
+
+ s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64,
+ s2[27], -cospi_8_64);
+ s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27],
+ cospi_24_64);
+
+ s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64,
+ s2[26], -cospi_8_64);
+ s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26],
+ cospi_24_64);
+
+ // stage 6
+ s2[0] = vaddq_s16(s1[0], s1[7]);
+ s2[1] = vaddq_s16(s1[0], s1[6]);
+ s2[2] = vaddq_s16(s1[0], s1[5]);
+ s2[3] = vaddq_s16(s1[0], s1[4]);
+ s2[4] = vsubq_s16(s1[0], s1[4]);
+ s2[5] = vsubq_s16(s1[0], s1[5]);
+ s2[6] = vsubq_s16(s1[0], s1[6]);
+ s2[7] = vsubq_s16(s1[0], s1[7]);
+
+ s2[10] = sub_multiply_shift_and_narrow_s16(s1[13], s1[10], cospi_16_64);
+ s2[13] = add_multiply_shift_and_narrow_s16(s1[10], s1[13], cospi_16_64);
+
+ s2[11] = sub_multiply_shift_and_narrow_s16(s1[12], s1[11], cospi_16_64);
+ s2[12] = add_multiply_shift_and_narrow_s16(s1[11], s1[12], cospi_16_64);
+
+ s1[16] = vaddq_s16(s2[16], s2[23]);
+ s1[17] = vaddq_s16(s2[17], s2[22]);
+ s2[18] = vaddq_s16(s1[18], s1[21]);
+ s2[19] = vaddq_s16(s1[19], s1[20]);
+ s2[20] = vsubq_s16(s1[19], s1[20]);
+ s2[21] = vsubq_s16(s1[18], s1[21]);
+ s1[22] = vsubq_s16(s2[17], s2[22]);
+ s1[23] = vsubq_s16(s2[16], s2[23]);
+
+ s3[24] = vsubq_s16(s2[31], s2[24]);
+ s3[25] = vsubq_s16(s2[30], s2[25]);
+ s3[26] = vsubq_s16(s1[29], s1[26]);
+ s3[27] = vsubq_s16(s1[28], s1[27]);
+ s2[28] = vaddq_s16(s1[27], s1[28]);
+ s2[29] = vaddq_s16(s1[26], s1[29]);
+ s2[30] = vaddq_s16(s2[25], s2[30]);
+ s2[31] = vaddq_s16(s2[24], s2[31]);
+
+ // stage 7
+ s1[0] = vaddq_s16(s2[0], s1[15]);
+ s1[1] = vaddq_s16(s2[1], s1[14]);
+ s1[2] = vaddq_s16(s2[2], s2[13]);
+ s1[3] = vaddq_s16(s2[3], s2[12]);
+ s1[4] = vaddq_s16(s2[4], s2[11]);
+ s1[5] = vaddq_s16(s2[5], s2[10]);
+ s1[6] = vaddq_s16(s2[6], s1[9]);
+ s1[7] = vaddq_s16(s2[7], s1[8]);
+ s1[8] = vsubq_s16(s2[7], s1[8]);
+ s1[9] = vsubq_s16(s2[6], s1[9]);
+ s1[10] = vsubq_s16(s2[5], s2[10]);
+ s1[11] = vsubq_s16(s2[4], s2[11]);
+ s1[12] = vsubq_s16(s2[3], s2[12]);
+ s1[13] = vsubq_s16(s2[2], s2[13]);
+ s1[14] = vsubq_s16(s2[1], s1[14]);
+ s1[15] = vsubq_s16(s2[0], s1[15]);
+
+ s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64);
+ s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64);
+
+ s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64);
+ s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64);
+
+ s2[22] = sub_multiply_shift_and_narrow_s16(s3[25], s1[22], cospi_16_64);
+ s1[25] = add_multiply_shift_and_narrow_s16(s1[22], s3[25], cospi_16_64);
+
+ s2[23] = sub_multiply_shift_and_narrow_s16(s3[24], s1[23], cospi_16_64);
+ s1[24] = add_multiply_shift_and_narrow_s16(s1[23], s3[24], cospi_16_64);
+
+ // final stage
+ out[0] = final_add(s1[0], s2[31]);
+ out[1] = final_add(s1[1], s2[30]);
+ out[2] = final_add(s1[2], s2[29]);
+ out[3] = final_add(s1[3], s2[28]);
+ out[4] = final_add(s1[4], s1[27]);
+ out[5] = final_add(s1[5], s1[26]);
+ out[6] = final_add(s1[6], s1[25]);
+ out[7] = final_add(s1[7], s1[24]);
+ out[8] = final_add(s1[8], s2[23]);
+ out[9] = final_add(s1[9], s2[22]);
+ out[10] = final_add(s1[10], s1[21]);
+ out[11] = final_add(s1[11], s1[20]);
+ out[12] = final_add(s1[12], s2[19]);
+ out[13] = final_add(s1[13], s2[18]);
+ out[14] = final_add(s1[14], s1[17]);
+ out[15] = final_add(s1[15], s1[16]);
+ out[16] = final_sub(s1[15], s1[16]);
+ out[17] = final_sub(s1[14], s1[17]);
+ out[18] = final_sub(s1[13], s2[18]);
+ out[19] = final_sub(s1[12], s2[19]);
+ out[20] = final_sub(s1[11], s1[20]);
+ out[21] = final_sub(s1[10], s1[21]);
+ out[22] = final_sub(s1[9], s2[22]);
+ out[23] = final_sub(s1[8], s2[23]);
+ out[24] = final_sub(s1[7], s1[24]);
+ out[25] = final_sub(s1[6], s1[25]);
+ out[26] = final_sub(s1[5], s1[26]);
+ out[27] = final_sub(s1[4], s1[27]);
+ out[28] = final_sub(s1[3], s2[28]);
+ out[29] = final_sub(s1[2], s2[29]);
+ out[30] = final_sub(s1[1], s2[30]);
+ out[31] = final_sub(s1[0], s2[31]);
+
+ if (highbd_flag) {
+ highbd_add_and_store_bd8(out, output, stride);
+ } else {
+ uint8_t *const outputT = (uint8_t *)output;
+ add_and_store_u8_s16(out + 0, outputT, stride);
+ add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride);
+ add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride);
+ add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride);
+ }
+}
+
+// 32x32 inverse transform for blocks with at most 34 non-zero coefficients
+// (all within the low-frequency corner — see the _6_/_8_ reduced kernels).
+// Pass 1 produces 8 intermediate rows; pass 2 reconstructs the 32x32 output
+// in four 8-column strips, adding into |dest|.
+void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest,
+                               int stride) {
+  int16_t row_buf[32 * 8];
+  int16_t *rows = row_buf;
+  int col;
+
+  // Pass 1: reduced column transform of the non-trivial coefficients.
+  vpx_idct32_6_neon(input, rows);
+
+  // Pass 2: full row transform, eight columns at a time.
+  for (col = 0; col < 32; col += 8) {
+    vpx_idct32_8_neon(rows, dest, stride, 0);
+    rows += 8 * 8;
+    dest += 8;
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
new file mode 100644
index 0000000000..9f4589ea96
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
@@ -0,0 +1,776 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+// Fetch two 8-lane rows (|first| and |second|) from the transposed
+// coefficient scratch buffer, which is laid out as consecutive groups of 8.
+static INLINE void load_from_transformed(const int16_t *const trans_buf,
+                                         const int first, const int second,
+                                         int16x8_t *const q0,
+                                         int16x8_t *const q1) {
+  const int16_t *const row0 = trans_buf + 8 * first;
+  const int16_t *const row1 = trans_buf + 8 * second;
+
+  *q0 = vld1q_s16(row0);
+  *q1 = vld1q_s16(row1);
+}
+
+// Load 8 lanes each from rows |first| and |second| of the 32-wide
+// intermediate output buffer.
+static INLINE void load_from_output(const int16_t *const out, const int first,
+                                    const int second, int16x8_t *const q0,
+                                    int16x8_t *const q1) {
+  const int16_t *const row0 = out + 32 * first;
+  const int16_t *const row1 = out + 32 * second;
+
+  *q0 = vld1q_s16(row0);
+  *q1 = vld1q_s16(row1);
+}
+
+// Store 8 lanes each into rows |first| and |second| of the 32-wide
+// intermediate output buffer.
+static INLINE void store_in_output(int16_t *const out, const int first,
+                                   const int second, const int16x8_t q0,
+                                   const int16x8_t q1) {
+  int16_t *const row0 = out + 32 * first;
+  int16_t *const row1 = out + 32 * second;
+
+  vst1q_s16(row0, q0);
+  vst1q_s16(row1, q1);
+}
+
+// Add the rounded residuals q0..q3 to four rows of 8-bit pixels and store
+// them back with saturation.  p1 addresses two consecutive rows walking down
+// the block; p2 addresses two rows walking up (the mirrored half of the idct
+// output).  The load order d[0], d[1], d[3], d[2] matches that walk, and the
+// stores below mirror it exactly.
+static INLINE void store_combine_results(uint8_t *p1, uint8_t *p2,
+                                         const int stride, int16x8_t q0,
+                                         int16x8_t q1, int16x8_t q2,
+                                         int16x8_t q3) {
+  uint8x8_t d[4];
+
+  d[0] = vld1_u8(p1);
+  p1 += stride;
+  d[1] = vld1_u8(p1);
+  d[3] = vld1_u8(p2);
+  p2 -= stride;
+  d[2] = vld1_u8(p2);
+
+  // The residuals carry 6 fractional bits at this point; round them away
+  // before adding to the pixels.
+  q0 = vrshrq_n_s16(q0, 6);
+  q1 = vrshrq_n_s16(q1, 6);
+  q2 = vrshrq_n_s16(q2, 6);
+  q3 = vrshrq_n_s16(q3, 6);
+
+  q0 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q0), d[0]));
+  q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1), d[1]));
+  q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2), d[2]));
+  q3 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q3), d[3]));
+
+  // Saturate back to the [0, 255] pixel range.
+  d[0] = vqmovun_s16(q0);
+  d[1] = vqmovun_s16(q1);
+  d[2] = vqmovun_s16(q2);
+  d[3] = vqmovun_s16(q3);
+
+  // p1 currently points at the second row of its pair, p2 at the first.
+  vst1_u8(p1, d[1]);
+  p1 -= stride;
+  vst1_u8(p1, d[0]);
+  vst1_u8(p2, d[2]);
+  p2 += stride;
+  vst1_u8(p2, d[3]);
+}
+
+// Identical in structure to store_combine_results(), but for 16-bit pixel
+// buffers whose content is 8-bit depth ("bd8"): sums are clamped to
+// [0, 255] via vqmovun_s16 and then widened back to 16 bits for storage.
+static INLINE void highbd_store_combine_results_bd8(uint16_t *p1, uint16_t *p2,
+                                                    const int stride,
+                                                    int16x8_t q0, int16x8_t q1,
+                                                    int16x8_t q2,
+                                                    int16x8_t q3) {
+  uint16x8_t d[4];
+
+  // Load order matches store_combine_results: p1 walks down, p2 walks up.
+  d[0] = vld1q_u16(p1);
+  p1 += stride;
+  d[1] = vld1q_u16(p1);
+  d[3] = vld1q_u16(p2);
+  p2 -= stride;
+  d[2] = vld1q_u16(p2);
+
+  // Drop the 6 fractional bits carried by the residuals, with rounding.
+  q0 = vrshrq_n_s16(q0, 6);
+  q1 = vrshrq_n_s16(q1, 6);
+  q2 = vrshrq_n_s16(q2, 6);
+  q3 = vrshrq_n_s16(q3, 6);
+
+  q0 = vaddq_s16(q0, vreinterpretq_s16_u16(d[0]));
+  q1 = vaddq_s16(q1, vreinterpretq_s16_u16(d[1]));
+  q2 = vaddq_s16(q2, vreinterpretq_s16_u16(d[2]));
+  q3 = vaddq_s16(q3, vreinterpretq_s16_u16(d[3]));
+
+  // Clamp to 8-bit range, then widen for the 16-bit pixel store.
+  d[0] = vmovl_u8(vqmovun_s16(q0));
+  d[1] = vmovl_u8(vqmovun_s16(q1));
+  d[2] = vmovl_u8(vqmovun_s16(q2));
+  d[3] = vmovl_u8(vqmovun_s16(q3));
+
+  // Stores mirror the loads.
+  vst1q_u16(p1, d[1]);
+  p1 -= stride;
+  vst1q_u16(p1, d[0]);
+  vst1q_u16(p2, d[2]);
+  p2 += stride;
+  vst1q_u16(p2, d[3]);
+}
+
+// Butterfly rotation used throughout the idct:
+//   *qOut0 = round((qIn0 * first_const - qIn1 * second_const) >>
+//                  DCT_CONST_BITS)
+//   *qOut1 = round((qIn0 * second_const + qIn1 * first_const) >>
+//                  DCT_CONST_BITS)
+// The products are accumulated in 32 bits and narrowed back to 16 bits.
+static INLINE void do_butterfly(const int16x8_t qIn0, const int16x8_t qIn1,
+                                const int16_t first_const,
+                                const int16_t second_const,
+                                int16x8_t *const qOut0,
+                                int16x8_t *const qOut1) {
+  int32x4_t q[4];
+  int16x4_t d[6];
+
+  // Split each 8-lane input into low/high halves for the widening multiplies.
+  d[0] = vget_low_s16(qIn0);
+  d[1] = vget_high_s16(qIn0);
+  d[2] = vget_low_s16(qIn1);
+  d[3] = vget_high_s16(qIn1);
+
+  // Note: using v{mul, mla, mls}l_n_s16 here slows down 35% with gcc 4.9.
+  d[4] = vdup_n_s16(first_const);
+  d[5] = vdup_n_s16(second_const);
+
+  q[0] = vmull_s16(d[0], d[4]);
+  q[1] = vmull_s16(d[1], d[4]);
+  q[0] = vmlsl_s16(q[0], d[2], d[5]);
+  q[1] = vmlsl_s16(q[1], d[3], d[5]);
+
+  q[2] = vmull_s16(d[0], d[5]);
+  q[3] = vmull_s16(d[1], d[5]);
+  q[2] = vmlal_s16(q[2], d[2], d[4]);
+  q[3] = vmlal_s16(q[3], d[3], d[4]);
+
+  // Round-shift back down to 16-bit precision.
+  *qOut0 = vcombine_s16(vrshrn_n_s32(q[0], DCT_CONST_BITS),
+                        vrshrn_n_s32(q[1], DCT_CONST_BITS));
+  *qOut1 = vcombine_s16(vrshrn_n_s32(q[2], DCT_CONST_BITS),
+                        vrshrn_n_s32(q[3], DCT_CONST_BITS));
+}
+
+// Read eight 8-lane rows, spaced 32 int16 values apart, into s0..s7.
+static INLINE void load_s16x8q(const int16_t *in, int16x8_t *const s0,
+                               int16x8_t *const s1, int16x8_t *const s2,
+                               int16x8_t *const s3, int16x8_t *const s4,
+                               int16x8_t *const s5, int16x8_t *const s6,
+                               int16x8_t *const s7) {
+  int16x8_t *const dst[8] = { s0, s1, s2, s3, s4, s5, s6, s7 };
+  int i;
+
+  for (i = 0; i < 8; i++) {
+    *dst[i] = vld1q_s16(in + 32 * i);
+  }
+}
+
+// Transpose the 8x8 tile held in a0..a7 and append it to *out, advancing
+// *out by 64 int16 values.
+static INLINE void transpose_and_store_s16_8x8(int16x8_t a0, int16x8_t a1,
+                                               int16x8_t a2, int16x8_t a3,
+                                               int16x8_t a4, int16x8_t a5,
+                                               int16x8_t a6, int16x8_t a7,
+                                               int16_t **out) {
+  int16x8_t rows[8];
+  int i;
+
+  transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  rows[0] = a0;
+  rows[1] = a1;
+  rows[2] = a2;
+  rows[3] = a3;
+  rows[4] = a4;
+  rows[5] = a5;
+  rows[6] = a6;
+  rows[7] = a7;
+
+  for (i = 0; i < 8; i++) {
+    vst1q_s16(*out, rows[i]);
+    *out += 8;
+  }
+}
+
+// Transpose one 8-row band of the 32-wide input (four 8x8 tiles) into the
+// contiguous scratch buffer t_buf.
+static INLINE void idct32_transpose_pair(const int16_t *input, int16_t *t_buf) {
+  int tile;
+  int16x8_t r0, r1, r2, r3, r4, r5, r6, r7;
+
+  for (tile = 0; tile < 4; tile++, input += 8) {
+    load_s16x8q(input, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+    transpose_and_store_s16_8x8(r0, r1, r2, r3, r4, r5, r6, r7, &t_buf);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// tran_low_t variant of load_s16x8q(): read eight rows spaced 32 elements
+// apart, narrowing each through load_tran_low_to_s16q().
+static INLINE void load_s16x8q_tran_low(
+    const tran_low_t *in, int16x8_t *const s0, int16x8_t *const s1,
+    int16x8_t *const s2, int16x8_t *const s3, int16x8_t *const s4,
+    int16x8_t *const s5, int16x8_t *const s6, int16x8_t *const s7) {
+  int16x8_t *const dst[8] = { s0, s1, s2, s3, s4, s5, s6, s7 };
+  int i;
+
+  for (i = 0; i < 8; i++) {
+    *dst[i] = load_tran_low_to_s16q(in + 32 * i);
+  }
+}
+
+// tran_low_t variant of idct32_transpose_pair(): transpose one 8-row band
+// (four 8x8 tiles) of raw coefficients into t_buf.
+static INLINE void idct32_transpose_pair_tran_low(const tran_low_t *input,
+                                                  int16_t *t_buf) {
+  int tile;
+  int16x8_t r0, r1, r2, r3, r4, r5, r6, r7;
+
+  for (tile = 0; tile < 4; tile++, input += 8) {
+    load_s16x8q_tran_low(input, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+    transpose_and_store_s16_8x8(r0, r1, r2, r3, r4, r5, r6, r7, &t_buf);
+  }
+}
+#else // !CONFIG_VP9_HIGHBITDEPTH
+#define idct32_transpose_pair_tran_low idct32_transpose_pair
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+// End of the first (row) pass for one 8-wide band: fold the stage-7 values
+// held in q[] together with the partial rows already written to |out|,
+// producing the final adds/subs for intermediate rows 0-31.
+// NOTE(review): the caller is expected to have left its stage-7 results in
+// q[2], q[3] and q[10]-q[15] before invoking this — confirm against
+// vpx_idct32_32_neon below.
+static INLINE void idct32_bands_end_1st_pass(int16_t *const out,
+                                             int16x8_t *const q) {
+  store_in_output(out, 16, 17, q[6], q[7]);
+  store_in_output(out, 14, 15, q[8], q[9]);
+
+  // Combine with rows 30/31 to form rows 0/1 and 30/31.
+  load_from_output(out, 30, 31, &q[0], &q[1]);
+  q[4] = vaddq_s16(q[2], q[1]);
+  q[5] = vaddq_s16(q[3], q[0]);
+  q[6] = vsubq_s16(q[3], q[0]);
+  q[7] = vsubq_s16(q[2], q[1]);
+  store_in_output(out, 30, 31, q[6], q[7]);
+  store_in_output(out, 0, 1, q[4], q[5]);
+
+  load_from_output(out, 12, 13, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[10], q[1]);
+  q[3] = vaddq_s16(q[11], q[0]);
+  q[4] = vsubq_s16(q[11], q[0]);
+  q[5] = vsubq_s16(q[10], q[1]);
+
+  load_from_output(out, 18, 19, &q[0], &q[1]);
+  q[8] = vaddq_s16(q[4], q[1]);
+  q[9] = vaddq_s16(q[5], q[0]);
+  q[6] = vsubq_s16(q[5], q[0]);
+  q[7] = vsubq_s16(q[4], q[1]);
+  store_in_output(out, 18, 19, q[6], q[7]);
+  store_in_output(out, 12, 13, q[8], q[9]);
+
+  load_from_output(out, 28, 29, &q[0], &q[1]);
+  q[4] = vaddq_s16(q[2], q[1]);
+  q[5] = vaddq_s16(q[3], q[0]);
+  q[6] = vsubq_s16(q[3], q[0]);
+  q[7] = vsubq_s16(q[2], q[1]);
+  store_in_output(out, 28, 29, q[6], q[7]);
+  store_in_output(out, 2, 3, q[4], q[5]);
+
+  load_from_output(out, 10, 11, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[12], q[1]);
+  q[3] = vaddq_s16(q[13], q[0]);
+  q[4] = vsubq_s16(q[13], q[0]);
+  q[5] = vsubq_s16(q[12], q[1]);
+
+  load_from_output(out, 20, 21, &q[0], &q[1]);
+  q[8] = vaddq_s16(q[4], q[1]);
+  q[9] = vaddq_s16(q[5], q[0]);
+  q[6] = vsubq_s16(q[5], q[0]);
+  q[7] = vsubq_s16(q[4], q[1]);
+  store_in_output(out, 20, 21, q[6], q[7]);
+  store_in_output(out, 10, 11, q[8], q[9]);
+
+  load_from_output(out, 26, 27, &q[0], &q[1]);
+  q[4] = vaddq_s16(q[2], q[1]);
+  q[5] = vaddq_s16(q[3], q[0]);
+  q[6] = vsubq_s16(q[3], q[0]);
+  q[7] = vsubq_s16(q[2], q[1]);
+  store_in_output(out, 26, 27, q[6], q[7]);
+  store_in_output(out, 4, 5, q[4], q[5]);
+
+  load_from_output(out, 8, 9, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[14], q[1]);
+  q[3] = vaddq_s16(q[15], q[0]);
+  q[4] = vsubq_s16(q[15], q[0]);
+  q[5] = vsubq_s16(q[14], q[1]);
+
+  load_from_output(out, 22, 23, &q[0], &q[1]);
+  q[8] = vaddq_s16(q[4], q[1]);
+  q[9] = vaddq_s16(q[5], q[0]);
+  q[6] = vsubq_s16(q[5], q[0]);
+  q[7] = vsubq_s16(q[4], q[1]);
+  store_in_output(out, 22, 23, q[6], q[7]);
+  store_in_output(out, 8, 9, q[8], q[9]);
+
+  load_from_output(out, 24, 25, &q[0], &q[1]);
+  q[4] = vaddq_s16(q[2], q[1]);
+  q[5] = vaddq_s16(q[3], q[0]);
+  q[6] = vsubq_s16(q[3], q[0]);
+  q[7] = vsubq_s16(q[2], q[1]);
+  store_in_output(out, 24, 25, q[6], q[7]);
+  store_in_output(out, 6, 7, q[4], q[5]);
+}
+
+// End of the second (column) pass: instead of writing back to the scratch
+// buffer, combine the stage-7 values in q[] with the rows loaded from |out|
+// and add everything into the 8-bit destination.  Rows are emitted in
+// mirrored pairs: dest0/dest1 walk inward from rows 0 and 31, dest2/dest3
+// from rows 16 and 15.
+// NOTE(review): as in the 1st-pass variant, the caller must have left its
+// stage-7 results in q[2], q[3] and q[10]-q[15].
+static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out,
+                                             uint8_t *const dest,
+                                             const int stride,
+                                             int16x8_t *const q) {
+  uint8_t *dest0 = dest + 0 * stride;
+  uint8_t *dest1 = dest + 31 * stride;
+  uint8_t *dest2 = dest + 16 * stride;
+  uint8_t *dest3 = dest + 15 * stride;
+  const int str2 = stride << 1;
+
+  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
+  dest2 += str2;
+  dest3 -= str2;
+
+  load_from_output(out, 30, 31, &q[0], &q[1]);
+  q[4] = final_add(q[2], q[1]);
+  q[5] = final_add(q[3], q[0]);
+  q[6] = final_sub(q[3], q[0]);
+  q[7] = final_sub(q[2], q[1]);
+  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
+  dest0 += str2;
+  dest1 -= str2;
+
+  load_from_output(out, 12, 13, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[10], q[1]);
+  q[3] = vaddq_s16(q[11], q[0]);
+  q[4] = vsubq_s16(q[11], q[0]);
+  q[5] = vsubq_s16(q[10], q[1]);
+
+  load_from_output(out, 18, 19, &q[0], &q[1]);
+  q[8] = final_add(q[4], q[1]);
+  q[9] = final_add(q[5], q[0]);
+  q[6] = final_sub(q[5], q[0]);
+  q[7] = final_sub(q[4], q[1]);
+  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
+  dest2 += str2;
+  dest3 -= str2;
+
+  load_from_output(out, 28, 29, &q[0], &q[1]);
+  q[4] = final_add(q[2], q[1]);
+  q[5] = final_add(q[3], q[0]);
+  q[6] = final_sub(q[3], q[0]);
+  q[7] = final_sub(q[2], q[1]);
+  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
+  dest0 += str2;
+  dest1 -= str2;
+
+  load_from_output(out, 10, 11, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[12], q[1]);
+  q[3] = vaddq_s16(q[13], q[0]);
+  q[4] = vsubq_s16(q[13], q[0]);
+  q[5] = vsubq_s16(q[12], q[1]);
+
+  load_from_output(out, 20, 21, &q[0], &q[1]);
+  q[8] = final_add(q[4], q[1]);
+  q[9] = final_add(q[5], q[0]);
+  q[6] = final_sub(q[5], q[0]);
+  q[7] = final_sub(q[4], q[1]);
+  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
+  dest2 += str2;
+  dest3 -= str2;
+
+  load_from_output(out, 26, 27, &q[0], &q[1]);
+  q[4] = final_add(q[2], q[1]);
+  q[5] = final_add(q[3], q[0]);
+  q[6] = final_sub(q[3], q[0]);
+  q[7] = final_sub(q[2], q[1]);
+  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
+  dest0 += str2;
+  dest1 -= str2;
+
+  load_from_output(out, 8, 9, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[14], q[1]);
+  q[3] = vaddq_s16(q[15], q[0]);
+  q[4] = vsubq_s16(q[15], q[0]);
+  q[5] = vsubq_s16(q[14], q[1]);
+
+  load_from_output(out, 22, 23, &q[0], &q[1]);
+  q[8] = final_add(q[4], q[1]);
+  q[9] = final_add(q[5], q[0]);
+  q[6] = final_sub(q[5], q[0]);
+  q[7] = final_sub(q[4], q[1]);
+  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
+
+  load_from_output(out, 24, 25, &q[0], &q[1]);
+  q[4] = final_add(q[2], q[1]);
+  q[5] = final_add(q[3], q[0]);
+  q[6] = final_sub(q[3], q[0]);
+  q[7] = final_sub(q[2], q[1]);
+  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
+}
+
+// Same row pairing and arithmetic as idct32_bands_end_2nd_pass(), but
+// writing through uint16_t destinations holding 8-bit-depth content via
+// highbd_store_combine_results_bd8().
+static INLINE void highbd_idct32_bands_end_2nd_pass_bd8(
+    const int16_t *const out, uint16_t *const dest, const int stride,
+    int16x8_t *const q) {
+  uint16_t *dest0 = dest + 0 * stride;
+  uint16_t *dest1 = dest + 31 * stride;
+  uint16_t *dest2 = dest + 16 * stride;
+  uint16_t *dest3 = dest + 15 * stride;
+  const int str2 = stride << 1;
+
+  highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
+                                   q[9]);
+  dest2 += str2;
+  dest3 -= str2;
+
+  load_from_output(out, 30, 31, &q[0], &q[1]);
+  q[4] = final_add(q[2], q[1]);
+  q[5] = final_add(q[3], q[0]);
+  q[6] = final_sub(q[3], q[0]);
+  q[7] = final_sub(q[2], q[1]);
+  highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
+                                   q[7]);
+  dest0 += str2;
+  dest1 -= str2;
+
+  load_from_output(out, 12, 13, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[10], q[1]);
+  q[3] = vaddq_s16(q[11], q[0]);
+  q[4] = vsubq_s16(q[11], q[0]);
+  q[5] = vsubq_s16(q[10], q[1]);
+
+  load_from_output(out, 18, 19, &q[0], &q[1]);
+  q[8] = final_add(q[4], q[1]);
+  q[9] = final_add(q[5], q[0]);
+  q[6] = final_sub(q[5], q[0]);
+  q[7] = final_sub(q[4], q[1]);
+  highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
+                                   q[9]);
+  dest2 += str2;
+  dest3 -= str2;
+
+  load_from_output(out, 28, 29, &q[0], &q[1]);
+  q[4] = final_add(q[2], q[1]);
+  q[5] = final_add(q[3], q[0]);
+  q[6] = final_sub(q[3], q[0]);
+  q[7] = final_sub(q[2], q[1]);
+  highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
+                                   q[7]);
+  dest0 += str2;
+  dest1 -= str2;
+
+  load_from_output(out, 10, 11, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[12], q[1]);
+  q[3] = vaddq_s16(q[13], q[0]);
+  q[4] = vsubq_s16(q[13], q[0]);
+  q[5] = vsubq_s16(q[12], q[1]);
+
+  load_from_output(out, 20, 21, &q[0], &q[1]);
+  q[8] = final_add(q[4], q[1]);
+  q[9] = final_add(q[5], q[0]);
+  q[6] = final_sub(q[5], q[0]);
+  q[7] = final_sub(q[4], q[1]);
+  highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
+                                   q[9]);
+  dest2 += str2;
+  dest3 -= str2;
+
+  load_from_output(out, 26, 27, &q[0], &q[1]);
+  q[4] = final_add(q[2], q[1]);
+  q[5] = final_add(q[3], q[0]);
+  q[6] = final_sub(q[3], q[0]);
+  q[7] = final_sub(q[2], q[1]);
+  highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
+                                   q[7]);
+  dest0 += str2;
+  dest1 -= str2;
+
+  load_from_output(out, 8, 9, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[14], q[1]);
+  q[3] = vaddq_s16(q[15], q[0]);
+  q[4] = vsubq_s16(q[15], q[0]);
+  q[5] = vsubq_s16(q[14], q[1]);
+
+  load_from_output(out, 22, 23, &q[0], &q[1]);
+  q[8] = final_add(q[4], q[1]);
+  q[9] = final_add(q[5], q[0]);
+  q[6] = final_sub(q[5], q[0]);
+  q[7] = final_sub(q[4], q[1]);
+  highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
+                                   q[9]);
+
+  load_from_output(out, 24, 25, &q[0], &q[1]);
+  q[4] = final_add(q[2], q[1]);
+  q[5] = final_add(q[3], q[0]);
+  q[6] = final_sub(q[3], q[0]);
+  q[7] = final_sub(q[2], q[1]);
+  highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
+                                   q[7]);
+}
+
+// Full 32x32 inverse DCT plus add, shared by the 8-bit path and the
+// high-bitdepth-with-8-bit-content path (selected by |highbd_flag|).
+// Two passes over the data: pass 1 transforms the input coefficients into
+// |pass1|; pass 2 transforms pass 1's output and adds the result into the
+// destination.  Each pass processes four 8-wide bands, transposing each
+// band into |trans_buf| first.
+void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest,
+                        const int stride, const int highbd_flag) {
+  int i, idct32_pass_loop;
+  int16_t trans_buf[32 * 8];
+  int16_t pass1[32 * 32];
+  int16_t pass2[32 * 32];
+  const int16_t *input_pass2 = pass1;  // input of pass2 is the result of pass1
+  int16_t *out;
+  int16x8_t q[16];
+  uint16_t *dst = CAST_TO_SHORTPTR(dest);  // only read when highbd_flag != 0
+
+  for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
+       idct32_pass_loop++, out = pass2) {
+    for (i = 0; i < 4; i++, out += 8) {  // idct32_bands_loop
+      // Pass 1 reads the raw coefficients; pass 2 reads pass 1's output.
+      if (idct32_pass_loop == 0) {
+        idct32_transpose_pair_tran_low(input, trans_buf);
+        input += 32 * 8;
+      } else {
+        idct32_transpose_pair(input_pass2, trans_buf);
+        input_pass2 += 32 * 8;
+      }
+
+      // -----------------------------------------
+      // BLOCK A: 16-19,28-31
+      // -----------------------------------------
+      // generate 16,17,30,31
+      // part of stage 1
+      load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]);
+      load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]);
+      // part of stage 2
+      q[4] = vaddq_s16(q[0], q[1]);
+      q[13] = vsubq_s16(q[0], q[1]);
+      q[6] = vaddq_s16(q[2], q[3]);
+      q[14] = vsubq_s16(q[2], q[3]);
+      // part of stage 3
+      do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]);
+
+      // generate 18,19,28,29
+      // part of stage 1
+      load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]);
+      load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]);
+      // part of stage 2
+      q[13] = vsubq_s16(q[3], q[2]);
+      q[3] = vaddq_s16(q[3], q[2]);
+      q[14] = vsubq_s16(q[1], q[0]);
+      q[2] = vaddq_s16(q[1], q[0]);
+      // part of stage 3
+      do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]);
+      // part of stage 4
+      q[8] = vaddq_s16(q[4], q[2]);
+      q[9] = vaddq_s16(q[5], q[0]);
+      q[10] = vaddq_s16(q[7], q[1]);
+      q[15] = vaddq_s16(q[6], q[3]);
+      q[13] = vsubq_s16(q[5], q[0]);
+      q[14] = vsubq_s16(q[7], q[1]);
+      store_in_output(out, 16, 31, q[8], q[15]);
+      store_in_output(out, 17, 30, q[9], q[10]);
+      // part of stage 5
+      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]);
+      store_in_output(out, 29, 18, q[1], q[0]);
+      // part of stage 4
+      q[13] = vsubq_s16(q[4], q[2]);
+      q[14] = vsubq_s16(q[6], q[3]);
+      // part of stage 5
+      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]);
+      store_in_output(out, 19, 28, q[4], q[6]);
+
+      // -----------------------------------------
+      // BLOCK B: 20-23,24-27
+      // -----------------------------------------
+      // generate 20,21,26,27
+      // part of stage 1
+      load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]);
+      load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]);
+      // part of stage 2
+      q[13] = vsubq_s16(q[0], q[1]);
+      q[0] = vaddq_s16(q[0], q[1]);
+      q[14] = vsubq_s16(q[2], q[3]);
+      q[2] = vaddq_s16(q[2], q[3]);
+      // part of stage 3
+      do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
+
+      // generate 22,23,24,25
+      // part of stage 1
+      load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]);
+      load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]);
+      // part of stage 2
+      q[14] = vsubq_s16(q[4], q[5]);
+      q[5] = vaddq_s16(q[4], q[5]);
+      q[13] = vsubq_s16(q[6], q[7]);
+      q[6] = vaddq_s16(q[6], q[7]);
+      // part of stage 3
+      do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]);
+      // part of stage 4
+      q[10] = vaddq_s16(q[7], q[1]);
+      q[11] = vaddq_s16(q[5], q[0]);
+      q[12] = vaddq_s16(q[6], q[2]);
+      q[15] = vaddq_s16(q[4], q[3]);
+      // part of stage 6
+      load_from_output(out, 16, 17, &q[14], &q[13]);
+      q[8] = vaddq_s16(q[14], q[11]);
+      q[9] = vaddq_s16(q[13], q[10]);
+      q[13] = vsubq_s16(q[13], q[10]);
+      q[11] = vsubq_s16(q[14], q[11]);
+      store_in_output(out, 17, 16, q[9], q[8]);
+      load_from_output(out, 30, 31, &q[14], &q[9]);
+      q[8] = vsubq_s16(q[9], q[12]);
+      q[10] = vaddq_s16(q[14], q[15]);
+      q[14] = vsubq_s16(q[14], q[15]);
+      q[12] = vaddq_s16(q[9], q[12]);
+      store_in_output(out, 30, 31, q[10], q[12]);
+      // part of stage 7
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+      store_in_output(out, 25, 22, q[14], q[13]);
+      do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+      store_in_output(out, 24, 23, q[14], q[13]);
+      // part of stage 4
+      q[14] = vsubq_s16(q[5], q[0]);
+      q[13] = vsubq_s16(q[6], q[2]);
+      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]);
+      q[14] = vsubq_s16(q[7], q[1]);
+      q[13] = vsubq_s16(q[4], q[3]);
+      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]);
+      // part of stage 6
+      load_from_output(out, 18, 19, &q[14], &q[13]);
+      q[8] = vaddq_s16(q[14], q[1]);
+      q[9] = vaddq_s16(q[13], q[6]);
+      q[13] = vsubq_s16(q[13], q[6]);
+      q[1] = vsubq_s16(q[14], q[1]);
+      store_in_output(out, 18, 19, q[8], q[9]);
+      load_from_output(out, 28, 29, &q[8], &q[9]);
+      q[14] = vsubq_s16(q[8], q[5]);
+      q[10] = vaddq_s16(q[8], q[5]);
+      q[11] = vaddq_s16(q[9], q[0]);
+      q[0] = vsubq_s16(q[9], q[0]);
+      store_in_output(out, 28, 29, q[10], q[11]);
+      // part of stage 7
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+      store_in_output(out, 20, 27, q[13], q[14]);
+      do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]);
+      store_in_output(out, 21, 26, q[1], q[0]);
+
+      // -----------------------------------------
+      // BLOCK C: 8-10,11-15
+      // -----------------------------------------
+      // generate 8,9,14,15
+      // part of stage 2
+      load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]);
+      load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]);
+      // part of stage 3
+      q[13] = vsubq_s16(q[0], q[1]);
+      q[0] = vaddq_s16(q[0], q[1]);
+      q[14] = vsubq_s16(q[2], q[3]);
+      q[2] = vaddq_s16(q[2], q[3]);
+      // part of stage 4
+      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]);
+
+      // generate 10,11,12,13
+      // part of stage 2
+      load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]);
+      load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]);
+      // part of stage 3
+      q[14] = vsubq_s16(q[4], q[5]);
+      q[5] = vaddq_s16(q[4], q[5]);
+      q[13] = vsubq_s16(q[6], q[7]);
+      q[6] = vaddq_s16(q[6], q[7]);
+      // part of stage 4
+      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]);
+      // part of stage 5
+      q[8] = vaddq_s16(q[0], q[5]);
+      q[9] = vaddq_s16(q[1], q[7]);
+      q[13] = vsubq_s16(q[1], q[7]);
+      q[14] = vsubq_s16(q[3], q[4]);
+      q[10] = vaddq_s16(q[3], q[4]);
+      q[15] = vaddq_s16(q[2], q[6]);
+      store_in_output(out, 8, 15, q[8], q[15]);
+      store_in_output(out, 9, 14, q[9], q[10]);
+      // part of stage 6
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+      store_in_output(out, 13, 10, q[3], q[1]);
+      q[13] = vsubq_s16(q[0], q[5]);
+      q[14] = vsubq_s16(q[2], q[6]);
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+      store_in_output(out, 11, 12, q[1], q[3]);
+
+      // -----------------------------------------
+      // BLOCK D: 0-3,4-7
+      // -----------------------------------------
+      // generate 4,5,6,7
+      // part of stage 3
+      load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]);
+      load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
+      // part of stage 4
+      q[13] = vsubq_s16(q[0], q[1]);
+      q[0] = vaddq_s16(q[0], q[1]);
+      q[14] = vsubq_s16(q[2], q[3]);
+      q[2] = vaddq_s16(q[2], q[3]);
+      // part of stage 5
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+
+      // generate 0,1,2,3
+      // part of stage 4
+      load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]);
+      load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]);
+      // part of stage 5
+      q[4] = vaddq_s16(q[7], q[6]);
+      q[7] = vsubq_s16(q[7], q[6]);
+      q[6] = vsubq_s16(q[5], q[14]);
+      q[5] = vaddq_s16(q[5], q[14]);
+      // part of stage 6
+      q[8] = vaddq_s16(q[4], q[2]);
+      q[9] = vaddq_s16(q[5], q[3]);
+      q[10] = vaddq_s16(q[6], q[1]);
+      q[11] = vaddq_s16(q[7], q[0]);
+      q[12] = vsubq_s16(q[7], q[0]);
+      q[13] = vsubq_s16(q[6], q[1]);
+      q[14] = vsubq_s16(q[5], q[3]);
+      q[15] = vsubq_s16(q[4], q[2]);
+      // part of stage 7
+      load_from_output(out, 14, 15, &q[0], &q[1]);
+      q[2] = vaddq_s16(q[8], q[1]);
+      q[3] = vaddq_s16(q[9], q[0]);
+      q[4] = vsubq_s16(q[9], q[0]);
+      q[5] = vsubq_s16(q[8], q[1]);
+      load_from_output(out, 16, 17, &q[0], &q[1]);
+      q[8] = final_add(q[4], q[1]);
+      q[9] = final_add(q[5], q[0]);
+      q[6] = final_sub(q[5], q[0]);
+      q[7] = final_sub(q[4], q[1]);
+
+      // Pass 1 writes back to the intermediate buffer; pass 2 adds into the
+      // destination (8-bit or 16-bit pixels depending on highbd_flag).
+      if (idct32_pass_loop == 0) {
+        idct32_bands_end_1st_pass(out, q);
+      } else {
+        if (highbd_flag) {
+          highbd_idct32_bands_end_2nd_pass_bd8(out, dst, stride, q);
+          dst += 8;
+        } else {
+          idct32_bands_end_2nd_pass(out, dest, stride, q);
+          dest += 8;
+        }
+      }
+    }
+  }
+}
+
+// Full 1024-coefficient 32x32 inverse transform for an 8-bit destination
+// (the final argument selects the non-high-bitdepth store path).
+void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
+                                 int stride) {
+  vpx_idct32_32_neon(input, dest, stride, 0);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm
new file mode 100644
index 0000000000..d83421e9e6
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm
@@ -0,0 +1,66 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vpx_idct4x4_1_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int stride)
+;
+; r0 int16_t input
+; r1 uint8_t *dest
+; r2 int stride)
+
+|vpx_idct4x4_1_add_neon| PROC
+    ; DC-only reconstruction: only input[0] is read.
+    ldrsh    r0, [r0]
+
+    ; cospi_16_64 = 11585
+    movw     r12, #0x2d41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul      r0, r0, r12           ; input[0] * cospi_16_64
+    add      r0, r0, #0x2000       ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr      r0, r0, #14           ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul      r0, r0, r12           ; out * cospi_16_64
+    mov      r12, r1               ; save dest
+    add      r0, r0, #0x2000       ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr      r0, r0, #14           ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 4)
+    add      r0, r0, #8            ; + (1 <<((4) - 1))
+    asr      r0, r0, #4            ; >> 4
+
+    vdup.s16 q0, r0                ; duplicate a1
+
+    ; load all four 4-pixel destination rows (r1 advances by stride)
+    vld1.32  {d2[0]}, [r1], r2
+    vld1.32  {d2[1]}, [r1], r2
+    vld1.32  {d4[0]}, [r1], r2
+    vld1.32  {d4[1]}, [r1]
+
+    vaddw.u8 q8, q0, d2            ; dest[x] + a1
+    vaddw.u8 q9, q0, d4
+
+    vqmovun.s16 d6, q8             ; clip_pixel
+    vqmovun.s16 d7, q9
+
+    ; store through the saved dest pointer (r12)
+    vst1.32  {d6[0]}, [r12], r2
+    vst1.32  {d6[1]}, [r12], r2
+    vst1.32  {d7[0]}, [r12], r2
+    vst1.32  {d7[1]}, [r12]
+
+    bx       lr
+    ENDP                ; |vpx_idct4x4_1_add_neon|
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
new file mode 100644
index 0000000000..a14b895431
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride,
+ const int16x8_t res,
+ uint32x2_t *const d) {
+ uint16x8_t a;
+ uint8x8_t b;
+ *d = vld1_lane_u32((const uint32_t *)*dest, *d, 0);
+ *d = vld1_lane_u32((const uint32_t *)(*dest + stride), *d, 1);
+ a = vaddw_u8(vreinterpretq_u16_s16(res), vreinterpret_u8_u32(*d));
+ b = vqmovun_s16(vreinterpretq_s16_u16(a));
+ vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 0);
+ *dest += stride;
+ vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 1);
+ *dest += stride;
+}
+
+void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const int16_t out0 =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+ const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
+ const int16x8_t dc = vdupq_n_s16(a1);
+ uint32x2_t d = vdup_n_u32(0);
+
+ assert(!((intptr_t)dest % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
+
+ idct4x4_1_add_kernel(&dest, stride, dc, &d);
+ idct4x4_1_add_kernel(&dest, stride, dc, &d);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
new file mode 100644
index 0000000000..175ba7fbc2
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
@@ -0,0 +1,188 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_idct4x4_16_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ INCLUDE vpx_dsp/arm/idct_neon.asm.S
+
+ AREA Block, CODE, READONLY
+;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride)
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int stride
+
+|vpx_idct4x4_16_add_neon| PROC
+
+ ; The 2D transform is done with two passes which are actually pretty
+ ; similar. We first transform the rows. This is done by transposing
+ ; the inputs, doing an SIMD column transform (the columns are the
+ ; transposed rows) and then transpose the results (so that it goes back
+ ; in normal/row positions). Then, we transform the columns by doing
+ ; another SIMD column transform.
+ ; So, two passes of a transpose followed by a column transform.
+
+ ; load the inputs into q8-q9, d16-d19
+ LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0
+
+ ; generate scalar constants
+ ; cospi_8_64 = 15137
+ movw r0, #0x3b21
+ ; cospi_16_64 = 11585
+ movw r3, #0x2d41
+ ; cospi_24_64 = 6270
+ movw r12, #0x187e
+
+ ; transpose the input data
+ ; 00 01 02 03 d16
+ ; 10 11 12 13 d17
+ ; 20 21 22 23 d18
+ ; 30 31 32 33 d19
+ vtrn.16 d16, d17
+ vtrn.16 d18, d19
+
+ ; generate constant vectors
+ vdup.16 d20, r0 ; replicate cospi_8_64
+ vdup.16 d21, r3 ; replicate cospi_16_64
+
+ ; 00 10 02 12 d16
+ ; 01 11 03 13 d17
+ ; 20 30 22 32 d18
+ ; 21 31 23 33 d19
+ vtrn.32 q8, q9
+ ; 00 10 20 30 d16
+ ; 01 11 21 31 d17
+ ; 02 12 22 32 d18
+ ; 03 13 23 33 d19
+
+ vdup.16 d22, r12 ; replicate cospi_24_64
+
+ ; do the transform on transposed rows
+
+ ; stage 1
+ vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64
+ vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64
+
+ ; (input[0] + input[2]) * cospi_16_64;
+ ; (input[0] - input[2]) * cospi_16_64;
+ vmull.s16 q8, d16, d21
+ vmull.s16 q14, d18, d21
+ vadd.s32 q13, q8, q14
+ vsub.s32 q14, q8, q14
+
+ ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ ; input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ vmlsl.s16 q15, d19, d20
+ vmlal.s16 q1, d19, d22
+
+ ; dct_const_round_shift
+ vrshrn.s32 d26, q13, #14
+ vrshrn.s32 d27, q14, #14
+ vrshrn.s32 d29, q15, #14
+ vrshrn.s32 d28, q1, #14
+
+ ; stage 2
+ ; output[0] = step[0] + step[3];
+ ; output[1] = step[1] + step[2];
+ ; output[3] = step[0] - step[3];
+ ; output[2] = step[1] - step[2];
+ vadd.s16 q8, q13, q14
+ vsub.s16 q9, q13, q14
+ vswp d18, d19
+
+ ; transpose the results
+ ; 00 01 02 03 d16
+ ; 10 11 12 13 d17
+ ; 20 21 22 23 d18
+ ; 30 31 32 33 d19
+ vtrn.16 d16, d17
+ vtrn.16 d18, d19
+ ; 00 10 02 12 d16
+ ; 01 11 03 13 d17
+ ; 20 30 22 32 d18
+ ; 21 31 23 33 d19
+ vtrn.32 q8, q9
+ ; 00 10 20 30 d16
+ ; 01 11 21 31 d17
+ ; 02 12 22 32 d18
+ ; 03 13 23 33 d19
+
+ ; do the transform on columns
+
+ ; stage 1
+ vadd.s16 d23, d16, d18 ; (input[0] + input[2])
+ vsub.s16 d24, d16, d18 ; (input[0] - input[2])
+
+ vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64
+ vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64
+
+ ; (input[0] + input[2]) * cospi_16_64;
+ ; (input[0] - input[2]) * cospi_16_64;
+ vmull.s16 q13, d23, d21
+ vmull.s16 q14, d24, d21
+
+ ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ ; input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ vmlsl.s16 q15, d19, d20
+ vmlal.s16 q1, d19, d22
+
+ ; dct_const_round_shift
+ vrshrn.s32 d26, q13, #14
+ vrshrn.s32 d27, q14, #14
+ vrshrn.s32 d29, q15, #14
+ vrshrn.s32 d28, q1, #14
+
+ ; stage 2
+ ; output[0] = step[0] + step[3];
+ ; output[1] = step[1] + step[2];
+ ; output[3] = step[0] - step[3];
+ ; output[2] = step[1] - step[2];
+ vadd.s16 q8, q13, q14
+ vsub.s16 q9, q13, q14
+
+ ; The results are in two registers, one of them being swapped. This will
+ ; be taken care of by loading the 'dest' value in a swapped fashion and
+ ; also storing them in the same swapped fashion.
+ ; temp_out[0, 1] = d16, d17 = q8
+ ; temp_out[2, 3] = d19, d18 = q9 swapped
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 4)
+ vrshr.s16 q8, q8, #4
+ vrshr.s16 q9, q9, #4
+
+ vld1.32 {d26[0]}, [r1], r2
+ vld1.32 {d26[1]}, [r1], r2
+ vld1.32 {d27[1]}, [r1], r2
+ vld1.32 {d27[0]}, [r1] ; no post-increment
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i]
+ vaddw.u8 q8, q8, d26
+ vaddw.u8 q9, q9, d27
+
+ ; clip_pixel
+ vqmovun.s16 d26, q8
+ vqmovun.s16 d27, q9
+
+ ; do the stores in reverse order with negative post-increment, by changing
+ ; the sign of the stride
+ rsb r2, r2, #0
+ vst1.32 {d27[0]}, [r1], r2
+ vst1.32 {d27[1]}, [r1], r2
+ vst1.32 {d26[1]}, [r1], r2
+ vst1.32 {d26[0]}, [r1] ; no post-increment
+ bx lr
+ ENDP ; |vpx_idct4x4_16_add_neon|
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
new file mode 100644
index 0000000000..8192ee4cf8
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const uint8_t *dst = dest;
+ uint32x2_t s32 = vdup_n_u32(0);
+ int16x8_t a[2];
+ uint8x8_t s, d[2];
+ uint16x8_t sum[2];
+
+ assert(!((intptr_t)dest % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
+
+ // Rows
+ a[0] = load_tran_low_to_s16q(input);
+ a[1] = load_tran_low_to_s16q(input + 8);
+ transpose_idct4x4_16_bd8(a);
+
+ // Columns
+ a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+ transpose_idct4x4_16_bd8(a);
+ a[0] = vrshrq_n_s16(a[0], 4);
+ a[1] = vrshrq_n_s16(a[1], 4);
+
+ s = load_u8(dst, stride);
+ dst += 2 * stride;
+ // The elements are loaded in reverse order.
+ s32 = vld1_lane_u32((const uint32_t *)dst, s32, 1);
+ dst += stride;
+ s32 = vld1_lane_u32((const uint32_t *)dst, s32, 0);
+
+ sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s);
+ sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), vreinterpret_u8_u32(s32));
+ d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0]));
+ d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1]));
+
+ store_u8(dest, stride, d[0]);
+ dest += 2 * stride;
+ // The elements are stored in reverse order.
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 1);
+ dest += stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 0);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
new file mode 100644
index 0000000000..ce9b459589
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE uint8x8_t create_dcd(const int16_t dc) {
+ int16x8_t t = vdupq_n_s16(dc);
+ return vqmovun_s16(t);
+}
+
+static INLINE void idct8x8_1_add_pos_kernel(uint8_t **dest, const int stride,
+ const uint8x8_t res) {
+ const uint8x8_t a = vld1_u8(*dest);
+ const uint8x8_t b = vqadd_u8(a, res);
+ vst1_u8(*dest, b);
+ *dest += stride;
+}
+
+static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride,
+ const uint8x8_t res) {
+ const uint8x8_t a = vld1_u8(*dest);
+ const uint8x8_t b = vqsub_u8(a, res);
+ vst1_u8(*dest, b);
+ *dest += stride;
+}
+
+void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const int16_t out0 =
+ WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
+ const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
+
+ if (a1 >= 0) {
+ const uint8x8_t dc = create_dcd(a1);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ idct8x8_1_add_pos_kernel(&dest, stride, dc);
+ } else {
+ const uint8x8_t dc = create_dcd(-a1);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
new file mode 100644
index 0000000000..7471387e47
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const int16x8_t cospis = vld1q_s16(kCospi);
+ const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24
+ const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28
+ int16x8_t a[8];
+
+ a[0] = load_tran_low_to_s16q(input);
+ a[1] = load_tran_low_to_s16q(input + 8);
+ a[2] = load_tran_low_to_s16q(input + 16);
+ a[3] = load_tran_low_to_s16q(input + 24);
+ a[4] = load_tran_low_to_s16q(input + 32);
+ a[5] = load_tran_low_to_s16q(input + 40);
+ a[6] = load_tran_low_to_s16q(input + 48);
+ a[7] = load_tran_low_to_s16q(input + 56);
+
+ idct8x8_64_1d_bd8(cospis0, cospis1, a);
+ idct8x8_64_1d_bd8(cospis0, cospis1, a);
+ idct8x8_add8x8_neon(a, dest, stride);
+}
+
+void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const int16x8_t cospis = vld1q_s16(kCospi);
+ const int16x8_t cospisd = vaddq_s16(cospis, cospis);
+ const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24
+ const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24
+ const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28
+ int16x4_t a[8];
+ int16x8_t b[8];
+
+ a[0] = load_tran_low_to_s16d(input);
+ a[1] = load_tran_low_to_s16d(input + 8);
+ a[2] = load_tran_low_to_s16d(input + 16);
+ a[3] = load_tran_low_to_s16d(input + 24);
+
+ idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, a);
+ idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a, b);
+ idct8x8_add8x8_neon(b, dest, stride);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.asm
new file mode 100644
index 0000000000..5dd9bdc788
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.asm
@@ -0,0 +1,46 @@
+;
+; Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ INCLUDE ./vpx_config.asm
+
+ ; Helper functions used to load tran_low_t into int16, narrowing if
+ ; necessary.
+
+ ; $dst0..3 are d registers with the pairs assumed to be contiguous in
+ ; non-high-bitdepth builds. q0-q3 are used as temporaries in high-bitdepth.
+ MACRO
+ LOAD_TRAN_LOW_TO_S16 $dst0, $dst1, $dst2, $dst3, $src
+ IF CONFIG_VP9_HIGHBITDEPTH
+ vld1.s32 {q0,q1}, [$src]!
+ vld1.s32 {q2,q3}, [$src]!
+ vmovn.i32 $dst0, q0
+ vmovn.i32 $dst1, q1
+ vmovn.i32 $dst2, q2
+ vmovn.i32 $dst3, q3
+ ELSE
+ vld1.s16 {$dst0-$dst1,$dst2-$dst3}, [$src]!
+ ENDIF
+ MEND
+
+ ; $dst0..3 are d registers. q0-q3 are used as temporaries in high-bitdepth.
+ MACRO
+ LOAD_TRAN_LOW_TO_S16X2 $dst0, $dst1, $dst2, $dst3, $src
+ IF CONFIG_VP9_HIGHBITDEPTH
+ vld2.s32 {q0,q1}, [$src]!
+ vld2.s32 {q2,q3}, [$src]!
+ vmovn.i32 $dst0, q0
+ vmovn.i32 $dst1, q2
+ vmovn.i32 $dst2, q1
+ vmovn.i32 $dst3, q3
+ ELSE
+ vld2.s16 {$dst0,$dst1,$dst2,$dst3}, [$src]!
+ ENDIF
+ MEND
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h
new file mode 100644
index 0000000000..c02311326b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h
@@ -0,0 +1,919 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_IDCT_NEON_H_
+#define VPX_VPX_DSP_ARM_IDCT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+static const int16_t kCospi[16] = {
+ 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
+ 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
+ 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
+ -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */,
+ 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */,
+ 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */,
+ 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */,
+ 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */
+};
+
+static const int32_t kCospi32[16] = {
+ 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
+ 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
+ 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
+ -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */,
+ 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */,
+ 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */,
+ 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */,
+ 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */
+};
+
+//------------------------------------------------------------------------------
+// Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth
+static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ return vqaddq_s16(a, b);
+#else
+ return vaddq_s16(a, b);
+#endif
+}
+
+static INLINE int16x8_t final_sub(const int16x8_t a, const int16x8_t b) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ return vqsubq_s16(a, b);
+#else
+ return vsubq_s16(a, b);
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+static INLINE int32x4x2_t highbd_idct_add_dual(const int32x4x2_t s0,
+ const int32x4x2_t s1) {
+ int32x4x2_t t;
+ t.val[0] = vaddq_s32(s0.val[0], s1.val[0]);
+ t.val[1] = vaddq_s32(s0.val[1], s1.val[1]);
+ return t;
+}
+
+static INLINE int32x4x2_t highbd_idct_sub_dual(const int32x4x2_t s0,
+ const int32x4x2_t s1) {
+ int32x4x2_t t;
+ t.val[0] = vsubq_s32(s0.val[0], s1.val[0]);
+ t.val[1] = vsubq_s32(s0.val[1], s1.val[1]);
+ return t;
+}
+
+//------------------------------------------------------------------------------
+
+static INLINE int16x8_t dct_const_round_shift_low_8(const int32x4_t *const in) {
+ return vcombine_s16(vrshrn_n_s32(in[0], DCT_CONST_BITS),
+ vrshrn_n_s32(in[1], DCT_CONST_BITS));
+}
+
+static INLINE void dct_const_round_shift_low_8_dual(const int32x4_t *const t32,
+ int16x8_t *const d0,
+ int16x8_t *const d1) {
+ *d0 = dct_const_round_shift_low_8(t32 + 0);
+ *d1 = dct_const_round_shift_low_8(t32 + 2);
+}
+
+static INLINE int32x4x2_t
+dct_const_round_shift_high_4x2(const int64x2_t *const in) {
+ int32x4x2_t out;
+ out.val[0] = vcombine_s32(vrshrn_n_s64(in[0], DCT_CONST_BITS),
+ vrshrn_n_s64(in[1], DCT_CONST_BITS));
+ out.val[1] = vcombine_s32(vrshrn_n_s64(in[2], DCT_CONST_BITS),
+ vrshrn_n_s64(in[3], DCT_CONST_BITS));
+ return out;
+}
+
+// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
+static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
+ const int16_t a_const) {
+ // Shift by DCT_CONST_BITS + rounding will be within 16 bits for well formed
+ // streams. See WRAPLOW and dct_const_round_shift for details.
+ // This instruction doubles the result and returns the high half, essentially
+ // resulting in a right shift by 15. By multiplying the constant first that
+ // becomes a right shift by DCT_CONST_BITS.
+ // The largest possible value used here is
+  // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728) which falls *just*
+ // within the range of int16_t (+32767 / -32768) even when negated.
+ return vqrdmulhq_n_s16(a, a_const * 2);
+}
+
+// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
+static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
+ const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
+  // In both add_ and its pair, sub_, the input for well-formed streams will be
+ // well within 16 bits (input to the idct is the difference between two frames
+ // and will be within -255 to 255, or 9 bits)
+ // However, for inputs over about 25,000 (valid for int16_t, but not for idct
+  // input) this function cannot use vaddq_s16.
+ // In order to match existing behavior and intentionally out of range tests,
+ // expand the addition up to 32 bits to prevent truncation.
+ int32x4_t t[2];
+ t[0] = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
+ t[1] = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
+ t[0] = vmulq_n_s32(t[0], ab_const);
+ t[1] = vmulq_n_s32(t[1], ab_const);
+ return dct_const_round_shift_low_8(t);
+}
+
+// Subtract b from a, then multiply by ab_const. Shift and narrow by
+// DCT_CONST_BITS.
+static INLINE int16x8_t sub_multiply_shift_and_narrow_s16(
+ const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
+ int32x4_t t[2];
+ t[0] = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
+ t[1] = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
+ t[0] = vmulq_n_s32(t[0], ab_const);
+ t[1] = vmulq_n_s32(t[1], ab_const);
+ return dct_const_round_shift_low_8(t);
+}
+
+// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
+// DCT_CONST_BITS.
+static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
+ const int16x8_t a, const int16_t a_const, const int16x8_t b,
+ const int16_t b_const) {
+ int32x4_t t[2];
+ t[0] = vmull_n_s16(vget_low_s16(a), a_const);
+ t[1] = vmull_n_s16(vget_high_s16(a), a_const);
+ t[0] = vmlal_n_s16(t[0], vget_low_s16(b), b_const);
+ t[1] = vmlal_n_s16(t[1], vget_high_s16(b), b_const);
+ return dct_const_round_shift_low_8(t);
+}
+
+//------------------------------------------------------------------------------
+
+// Note: The following 4 functions could use 32-bit operations for bit-depth 10.
+// However, although it's 20% faster with gcc, it's 20% slower with clang.
+// Use 64-bit operations for now.
+
+// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
+static INLINE int32x4x2_t
+multiply_shift_and_narrow_s32_dual(const int32x4x2_t a, const int32_t a_const) {
+ int64x2_t b[4];
+
+ b[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const);
+ b[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const);
+ b[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const);
+ b[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const);
+ return dct_const_round_shift_high_4x2(b);
+}
+
+// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
+static INLINE int32x4x2_t add_multiply_shift_and_narrow_s32_dual(
+ const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) {
+ int32x4_t t[2];
+ int64x2_t c[4];
+
+ t[0] = vaddq_s32(a.val[0], b.val[0]);
+ t[1] = vaddq_s32(a.val[1], b.val[1]);
+ c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const);
+ c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const);
+ c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const);
+ c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const);
+ return dct_const_round_shift_high_4x2(c);
+}
+
+// Subtract b from a, then multiply by ab_const. Shift and narrow by
+// DCT_CONST_BITS.
+static INLINE int32x4x2_t sub_multiply_shift_and_narrow_s32_dual(
+ const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) {
+ int32x4_t t[2];
+ int64x2_t c[4];
+
+ t[0] = vsubq_s32(a.val[0], b.val[0]);
+ t[1] = vsubq_s32(a.val[1], b.val[1]);
+ c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const);
+ c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const);
+ c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const);
+ c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const);
+ return dct_const_round_shift_high_4x2(c);
+}
+
+// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
+// DCT_CONST_BITS.
+static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual(
+ const int32x4x2_t a, const int32_t a_const, const int32x4x2_t b,
+ const int32_t b_const) {
+ int64x2_t c[4];
+ c[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const);
+ c[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const);
+ c[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const);
+ c[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const);
+ c[0] = vmlal_n_s32(c[0], vget_low_s32(b.val[0]), b_const);
+ c[1] = vmlal_n_s32(c[1], vget_high_s32(b.val[0]), b_const);
+ c[2] = vmlal_n_s32(c[2], vget_low_s32(b.val[1]), b_const);
+ c[3] = vmlal_n_s32(c[3], vget_high_s32(b.val[1]), b_const);
+ return dct_const_round_shift_high_4x2(c);
+}
+
+// Shift the output down by 6 and add it to the destination buffer.
+static INLINE void add_and_store_u8_s16(const int16x8_t *const a, uint8_t *d,
+ const int stride) {
+ uint8x8_t b[8];
+ int16x8_t c[8];
+
+ b[0] = vld1_u8(d);
+ d += stride;
+ b[1] = vld1_u8(d);
+ d += stride;
+ b[2] = vld1_u8(d);
+ d += stride;
+ b[3] = vld1_u8(d);
+ d += stride;
+ b[4] = vld1_u8(d);
+ d += stride;
+ b[5] = vld1_u8(d);
+ d += stride;
+ b[6] = vld1_u8(d);
+ d += stride;
+ b[7] = vld1_u8(d);
+ d -= (7 * stride);
+
+ // c = b + (a >> 6)
+ c[0] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[0])), a[0], 6);
+ c[1] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[1])), a[1], 6);
+ c[2] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[2])), a[2], 6);
+ c[3] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[3])), a[3], 6);
+ c[4] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[4])), a[4], 6);
+ c[5] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[5])), a[5], 6);
+ c[6] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[6])), a[6], 6);
+ c[7] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[7])), a[7], 6);
+
+ b[0] = vqmovun_s16(c[0]);
+ b[1] = vqmovun_s16(c[1]);
+ b[2] = vqmovun_s16(c[2]);
+ b[3] = vqmovun_s16(c[3]);
+ b[4] = vqmovun_s16(c[4]);
+ b[5] = vqmovun_s16(c[5]);
+ b[6] = vqmovun_s16(c[6]);
+ b[7] = vqmovun_s16(c[7]);
+
+ vst1_u8(d, b[0]);
+ d += stride;
+ vst1_u8(d, b[1]);
+ d += stride;
+ vst1_u8(d, b[2]);
+ d += stride;
+ vst1_u8(d, b[3]);
+ d += stride;
+ vst1_u8(d, b[4]);
+ d += stride;
+ vst1_u8(d, b[5]);
+ d += stride;
+ vst1_u8(d, b[6]);
+ d += stride;
+ vst1_u8(d, b[7]);
+}
+
+static INLINE uint8x16_t create_dcq(const int16_t dc) {
+ // Clip both sides and gcc may compile to assembly 'usat'.
+ const int16_t t = (dc < 0) ? 0 : ((dc > 255) ? 255 : dc);
+ return vdupq_n_u8((uint8_t)t);
+}
+
+static INLINE void idct4x4_16_kernel_bd8(int16x8_t *const a) {
+ const int16x4_t cospis = vld1_s16(kCospi);
+ int16x4_t b[4];
+ int32x4_t c[4];
+ int16x8_t d[2];
+
+ b[0] = vget_low_s16(a[0]);
+ b[1] = vget_high_s16(a[0]);
+ b[2] = vget_low_s16(a[1]);
+ b[3] = vget_high_s16(a[1]);
+ c[0] = vmull_lane_s16(b[0], cospis, 2);
+ c[2] = vmull_lane_s16(b[1], cospis, 2);
+ c[1] = vsubq_s32(c[0], c[2]);
+ c[0] = vaddq_s32(c[0], c[2]);
+ c[3] = vmull_lane_s16(b[2], cospis, 3);
+ c[2] = vmull_lane_s16(b[2], cospis, 1);
+ c[3] = vmlsl_lane_s16(c[3], b[3], cospis, 1);
+ c[2] = vmlal_lane_s16(c[2], b[3], cospis, 3);
+ dct_const_round_shift_low_8_dual(c, &d[0], &d[1]);
+ a[0] = vaddq_s16(d[0], d[1]);
+ a[1] = vsubq_s16(d[0], d[1]);
+}
+
+static INLINE void transpose_idct4x4_16_bd8(int16x8_t *const a) {
+ transpose_s16_4x4q(&a[0], &a[1]);
+ idct4x4_16_kernel_bd8(a);
+}
+
+static INLINE void idct8x8_12_pass1_bd8(const int16x4_t cospis0,
+ const int16x4_t cospisd0,
+ const int16x4_t cospisd1,
+ int16x4_t *const io) {
+ int16x4_t step1[8], step2[8];
+ int32x4_t t32[2];
+
+ transpose_s16_4x4d(&io[0], &io[1], &io[2], &io[3]);
+
+ // stage 1
+ step1[4] = vqrdmulh_lane_s16(io[1], cospisd1, 3);
+ step1[5] = vqrdmulh_lane_s16(io[3], cospisd1, 2);
+ step1[6] = vqrdmulh_lane_s16(io[3], cospisd1, 1);
+ step1[7] = vqrdmulh_lane_s16(io[1], cospisd1, 0);
+
+ // stage 2
+ step2[1] = vqrdmulh_lane_s16(io[0], cospisd0, 2);
+ step2[2] = vqrdmulh_lane_s16(io[2], cospisd0, 3);
+ step2[3] = vqrdmulh_lane_s16(io[2], cospisd0, 1);
+
+ step2[4] = vadd_s16(step1[4], step1[5]);
+ step2[5] = vsub_s16(step1[4], step1[5]);
+ step2[6] = vsub_s16(step1[7], step1[6]);
+ step2[7] = vadd_s16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = vadd_s16(step2[1], step2[3]);
+ step1[1] = vadd_s16(step2[1], step2[2]);
+ step1[2] = vsub_s16(step2[1], step2[2]);
+ step1[3] = vsub_s16(step2[1], step2[3]);
+
+ t32[1] = vmull_lane_s16(step2[6], cospis0, 2);
+ t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2);
+ t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2);
+ step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+
+ // stage 4
+ io[0] = vadd_s16(step1[0], step2[7]);
+ io[1] = vadd_s16(step1[1], step1[6]);
+ io[2] = vadd_s16(step1[2], step1[5]);
+ io[3] = vadd_s16(step1[3], step2[4]);
+ io[4] = vsub_s16(step1[3], step2[4]);
+ io[5] = vsub_s16(step1[2], step1[5]);
+ io[6] = vsub_s16(step1[1], step1[6]);
+ io[7] = vsub_s16(step1[0], step2[7]);
+}
+
+static INLINE void idct8x8_12_pass2_bd8(const int16x4_t cospis0,
+ const int16x4_t cospisd0,
+ const int16x4_t cospisd1,
+ const int16x4_t *const input,
+ int16x8_t *const output) {
+ int16x8_t in[4];
+ int16x8_t step1[8], step2[8];
+ int32x4_t t32[8];
+
+ transpose_s16_4x8(input[0], input[1], input[2], input[3], input[4], input[5],
+ input[6], input[7], &in[0], &in[1], &in[2], &in[3]);
+
+ // stage 1
+ step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3);
+ step1[5] = vqrdmulhq_lane_s16(in[3], cospisd1, 2);
+ step1[6] = vqrdmulhq_lane_s16(in[3], cospisd1, 1);
+ step1[7] = vqrdmulhq_lane_s16(in[1], cospisd1, 0);
+
+ // stage 2
+ step2[1] = vqrdmulhq_lane_s16(in[0], cospisd0, 2);
+ step2[2] = vqrdmulhq_lane_s16(in[2], cospisd0, 3);
+ step2[3] = vqrdmulhq_lane_s16(in[2], cospisd0, 1);
+
+ step2[4] = vaddq_s16(step1[4], step1[5]);
+ step2[5] = vsubq_s16(step1[4], step1[5]);
+ step2[6] = vsubq_s16(step1[7], step1[6]);
+ step2[7] = vaddq_s16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = vaddq_s16(step2[1], step2[3]);
+ step1[1] = vaddq_s16(step2[1], step2[2]);
+ step1[2] = vsubq_s16(step2[1], step2[2]);
+ step1[3] = vsubq_s16(step2[1], step2[3]);
+
+ t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2);
+ t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2);
+ t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
+ t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
+ t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
+ t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
+ dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]);
+
+ // stage 4
+ output[0] = vaddq_s16(step1[0], step2[7]);
+ output[1] = vaddq_s16(step1[1], step1[6]);
+ output[2] = vaddq_s16(step1[2], step1[5]);
+ output[3] = vaddq_s16(step1[3], step2[4]);
+ output[4] = vsubq_s16(step1[3], step2[4]);
+ output[5] = vsubq_s16(step1[2], step1[5]);
+ output[6] = vsubq_s16(step1[1], step1[6]);
+ output[7] = vsubq_s16(step1[0], step2[7]);
+}
+
// One 8-point IDCT pass (full 64-coefficient path) over eight columns of
// 16-bit coefficients held in io[0..7]; results are written back in place.
// cospis0/cospis1 carry precomputed cosine constants; the lane index in each
// vmull/vmlal/vmlsl selects the constant for that butterfly.  Intermediate
// products are widened to 32 bits and narrowed back with
// dct_const_round_shift_low_8_dual.
static INLINE void idct8x8_64_1d_bd8_kernel(const int16x4_t cospis0,
                                            const int16x4_t cospis1,
                                            int16x8_t *const io) {
  int16x4_t input1l, input1h, input3l, input3h, input5l, input5h, input7l,
      input7h;
  int16x4_t step1l[4], step1h[4];
  int16x8_t step1[8], step2[8];
  int32x4_t t32[8];

  // stage 1
  // Split the odd-indexed rows into low/high halves for 16x16->32 multiplies.
  input1l = vget_low_s16(io[1]);
  input1h = vget_high_s16(io[1]);
  input3l = vget_low_s16(io[3]);
  input3h = vget_high_s16(io[3]);
  input5l = vget_low_s16(io[5]);
  input5h = vget_high_s16(io[5]);
  input7l = vget_low_s16(io[7]);
  input7h = vget_high_s16(io[7]);
  // Even-indexed rows feed stage 2 directly.
  step1l[0] = vget_low_s16(io[0]);
  step1h[0] = vget_high_s16(io[0]);
  step1l[1] = vget_low_s16(io[2]);
  step1h[1] = vget_high_s16(io[2]);
  step1l[2] = vget_low_s16(io[4]);
  step1h[2] = vget_high_s16(io[4]);
  step1l[3] = vget_low_s16(io[6]);
  step1h[3] = vget_high_s16(io[6]);

  // Odd butterflies: rows 1/7 and 3/5 combined with cospis1 constants.
  t32[0] = vmull_lane_s16(input1l, cospis1, 3);
  t32[1] = vmull_lane_s16(input1h, cospis1, 3);
  t32[2] = vmull_lane_s16(input3l, cospis1, 2);
  t32[3] = vmull_lane_s16(input3h, cospis1, 2);
  t32[4] = vmull_lane_s16(input3l, cospis1, 1);
  t32[5] = vmull_lane_s16(input3h, cospis1, 1);
  t32[6] = vmull_lane_s16(input1l, cospis1, 0);
  t32[7] = vmull_lane_s16(input1h, cospis1, 0);
  t32[0] = vmlsl_lane_s16(t32[0], input7l, cospis1, 0);
  t32[1] = vmlsl_lane_s16(t32[1], input7h, cospis1, 0);
  t32[2] = vmlal_lane_s16(t32[2], input5l, cospis1, 1);
  t32[3] = vmlal_lane_s16(t32[3], input5h, cospis1, 1);
  t32[4] = vmlsl_lane_s16(t32[4], input5l, cospis1, 2);
  t32[5] = vmlsl_lane_s16(t32[5], input5h, cospis1, 2);
  t32[6] = vmlal_lane_s16(t32[6], input7l, cospis1, 3);
  t32[7] = vmlal_lane_s16(t32[7], input7h, cospis1, 3);
  dct_const_round_shift_low_8_dual(&t32[0], &step1[4], &step1[5]);
  dct_const_round_shift_low_8_dual(&t32[4], &step1[6], &step1[7]);

  // stage 2
  // Even butterflies: rows 0/4 and 2/6 combined with cospis0 constants.
  t32[2] = vmull_lane_s16(step1l[0], cospis0, 2);
  t32[3] = vmull_lane_s16(step1h[0], cospis0, 2);
  t32[4] = vmull_lane_s16(step1l[1], cospis0, 3);
  t32[5] = vmull_lane_s16(step1h[1], cospis0, 3);
  t32[6] = vmull_lane_s16(step1l[1], cospis0, 1);
  t32[7] = vmull_lane_s16(step1h[1], cospis0, 1);
  t32[0] = vmlal_lane_s16(t32[2], step1l[2], cospis0, 2);
  t32[1] = vmlal_lane_s16(t32[3], step1h[2], cospis0, 2);
  t32[2] = vmlsl_lane_s16(t32[2], step1l[2], cospis0, 2);
  t32[3] = vmlsl_lane_s16(t32[3], step1h[2], cospis0, 2);
  t32[4] = vmlsl_lane_s16(t32[4], step1l[3], cospis0, 1);
  t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1);
  t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3);
  t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3);
  dct_const_round_shift_low_8_dual(&t32[0], &step2[0], &step2[1]);
  dct_const_round_shift_low_8_dual(&t32[4], &step2[2], &step2[3]);

  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s16(step2[0], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[0], step2[3]);

  // step1[5]/step1[6] = (step2[6] -/+ step2[5]) * cospi_16 (lane 2), rounded.
  t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2);
  t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2);
  t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
  dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]);

  // stage 4
  // Final mirrored add/sub writes the transformed rows back into io[].
  io[0] = vaddq_s16(step1[0], step2[7]);
  io[1] = vaddq_s16(step1[1], step1[6]);
  io[2] = vaddq_s16(step1[2], step1[5]);
  io[3] = vaddq_s16(step1[3], step2[4]);
  io[4] = vsubq_s16(step1[3], step2[4]);
  io[5] = vsubq_s16(step1[2], step1[5]);
  io[6] = vsubq_s16(step1[1], step1[6]);
  io[7] = vsubq_s16(step1[0], step2[7]);
}
+
// Transpose the 8x8 block so rows become columns, then run the 1-D IDCT
// kernel over the transposed data; the same kernel thereby serves both the
// row and column passes of the 2-D transform.
static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
                                     const int16x4_t cospis1,
                                     int16x8_t *const io) {
  transpose_s16_8x8(&io[0], &io[1], &io[2], &io[3], &io[4], &io[5], &io[6],
                    &io[7]);
  idct8x8_64_1d_bd8_kernel(cospis0, cospis1, io);
}
+
// Widened butterfly with the cospi_24 (lane 3) and cospi_8 (lane 1)
// constants:
//   t32[0..1] = s0 * cospi_24 - s1 * cospi_8
//   t32[2..3] = s1 * cospi_24 + s0 * cospi_8
// Results are left as 32-bit products; callers round/narrow them.
static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0,
                                            const int16x8_t s1,
                                            const int16x4_t cospi_0_8_16_24,
                                            int32x4_t *const t32) {
  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_0_8_16_24, 3);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_0_8_16_24, 3);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 3);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 3);
  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_0_8_16_24, 1);
  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_0_8_16_24, 1);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_0_8_16_24, 1);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_0_8_16_24, 1);
}
+
// d0 = round(s0 * cospi_24 - s1 * cospi_8), d1 = round(s1 * cospi_24 +
// s0 * cospi_8): the 8/24 butterfly followed by the DCT rounding shift.
static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1,
                                     const int16x4_t cospi_0_8_16_24,
                                     int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
  dct_const_round_shift_low_8_dual(t32, d0, d1);
}
+
// Same as idct_cospi_8_24_q but with the second output negated before
// rounding, i.e. d1 = round(-(s1 * cospi_24 + s0 * cospi_8)).
static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1,
                                         const int16x4_t cospi_0_8_16_24,
                                         int16x8_t *const d0,
                                         int16x8_t *const d1) {
  int32x4_t t32[4];

  idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
  // Negate the wide products feeding d1 prior to the rounding narrow.
  t32[2] = vnegq_s32(t32[2]);
  t32[3] = vnegq_s32(t32[3]);
  dct_const_round_shift_low_8_dual(t32, d0, d1);
}
+
// d0 = round((s1 - s0) * cospi_16), d1 = round((s1 + s0) * cospi_16).
// The shared s1 * cospi_16 product (lane 2) is computed once in t32[4..5]
// and reused for both the subtract and add paths.
static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1,
                                      const int16x4_t cospi_0_8_16_24,
                                      int16x8_t *const d0,
                                      int16x8_t *const d1) {
  int32x4_t t32[6];

  t32[4] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 2);
  t32[5] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 2);
  t32[0] = vmlsl_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
  t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
  t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
  t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
  dct_const_round_shift_low_8_dual(t32, d0, d1);
}
+
// d0 = round(s0 * cospi_30 - s1 * cospi_2),
// d1 = round(s1 * cospi_30 + s0 * cospi_2)
// using lanes 1 (cospi_30) and 0 (cospi_2) of cospi_2_30_10_22.
static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1,
                                   const int16x4_t cospi_2_30_10_22,
                                   int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1);
  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0);
  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0);
  dct_const_round_shift_low_8_dual(t32, d0, d1);
}
+
// d0 = round(s0 * cospi_28 - s1 * cospi_4),
// d1 = round(s1 * cospi_28 + s0 * cospi_4)
// using lanes 3 (cospi_28) and 0 (cospi_4) of cospi_4_12_20N_28.
static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1,
                                   const int16x4_t cospi_4_12_20N_28,
                                   int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3);
  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0);
  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0);
  dct_const_round_shift_low_8_dual(t32, d0, d1);
}
+
// 6/26 butterfly using lanes 0 (cospi_6) and 1 (-cospi_26; the constant is
// stored negated, hence vmlal/vmlsl are swapped relative to the other
// idct_cospi_* helpers) of cospi_6_26N_14_18N.
static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1,
                                   const int16x4_t cospi_6_26N_14_18N,
                                   int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 0);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 0);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 0);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 0);
  t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 1);
  t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 1);
  t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 1);
  t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 1);
  dct_const_round_shift_low_8_dual(t32, d0, d1);
}
+
// d0 = round(s0 * cospi_22 - s1 * cospi_10),
// d1 = round(s1 * cospi_22 + s0 * cospi_10)
// using lanes 3 (cospi_22) and 2 (cospi_10) of cospi_2_30_10_22.
static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1,
                                    const int16x4_t cospi_2_30_10_22,
                                    int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3);
  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2);
  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2);
  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2);
  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2);
  dct_const_round_shift_low_8_dual(t32, d0, d1);
}
+
// 12/20 butterfly using lanes 1 (cospi_12) and 2 (-cospi_20; stored negated,
// so vmlal/vmlsl are swapped relative to the positive-constant helpers) of
// cospi_4_12_20N_28.
static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1,
                                    const int16x4_t cospi_4_12_20N_28,
                                    int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1);
  t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2);
  t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2);
  t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2);
  t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2);
  dct_const_round_shift_low_8_dual(t32, d0, d1);
}
+
// 14/18 butterfly using lanes 2 (cospi_14) and 3 (-cospi_18; stored negated,
// so vmlal/vmlsl are swapped relative to the positive-constant helpers) of
// cospi_6_26N_14_18N.
static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1,
                                    const int16x4_t cospi_6_26N_14_18N,
                                    int16x8_t *const d0, int16x8_t *const d1) {
  int32x4_t t32[4];

  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 2);
  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 2);
  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 2);
  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 2);
  t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 3);
  t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 3);
  t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 3);
  t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 3);
  dct_const_round_shift_low_8_dual(t32, d0, d1);
}
+
// Final (stage 7) butterfly of the 16-point IDCT: out[i] and out[15-i] are
// the sum and difference of the mirrored pair step2[i]/step2[15-i].
static INLINE void idct16x16_add_stage7(const int16x8_t *const step2,
                                        int16x8_t *const out) {
  int i;
#if CONFIG_VP9_HIGHBITDEPTH
  // Use saturating add/sub to avoid overflow in 2nd pass
  for (i = 0; i < 8; ++i) {
    out[i] = vqaddq_s16(step2[i], step2[15 - i]);
    out[15 - i] = vqsubq_s16(step2[i], step2[15 - i]);
  }
#else
  for (i = 0; i < 8; ++i) {
    out[i] = vaddq_s16(step2[i], step2[15 - i]);
    out[15 - i] = vsubq_s16(step2[i], step2[15 - i]);
  }
#endif
}
+
// Store the 16 intermediate row vectors of pass 1 into the 16x16 scratch
// buffer; each row vector lands 16 int16 elements apart.
static INLINE void idct16x16_store_pass1(const int16x8_t *const out,
                                         int16_t *output) {
  int i;
  for (i = 0; i < 16; ++i) {
    vst1q_s16(output + 16 * i, out[i]);
  }
}
+
// Round-shift one row of 8x8 IDCT output by 5, add it to the predictor row
// at *dest, saturate to [0, 255], store the result, and advance *dest by
// one row.
static INLINE void idct8x8_add8x1(const int16x8_t a, uint8_t **const dest,
                                  const int stride) {
  const uint8x8_t pred = vld1_u8(*dest);
  const int16x8_t shifted = vrshrq_n_s16(a, 5);
  const uint16x8_t sum = vaddw_u8(vreinterpretq_u16_s16(shifted), pred);
  const uint8x8_t result = vqmovun_s16(vreinterpretq_s16_u16(sum));
  vst1_u8(*dest, result);
  *dest += stride;
}
+
// Add all eight reconstructed rows in out[] onto the destination block.
static INLINE void idct8x8_add8x8_neon(int16x8_t *const out, uint8_t *dest,
                                       const int stride) {
  int i;
  for (i = 0; i < 8; ++i) {
    idct8x8_add8x1(out[i], &dest, stride);
  }
}
+
// Round-shift one 8-wide row of 16x16 IDCT output by 6, add it to the
// predictor at *dest, saturate to [0, 255], store, and advance *dest by one
// row.
static INLINE void idct16x16_add8x1(const int16x8_t a, uint8_t **const dest,
                                    const int stride) {
  const uint8x8_t pred = vld1_u8(*dest);
  const int16x8_t shifted = vrshrq_n_s16(a, 6);
  const uint16x8_t sum = vaddw_u8(vreinterpretq_u16_s16(shifted), pred);
  const uint8x8_t result = vqmovun_s16(vreinterpretq_s16_u16(sum));
  vst1_u8(*dest, result);
  *dest += stride;
}
+
// Add all sixteen reconstructed rows in out[] onto the destination block.
static INLINE void idct16x16_add_store(const int16x8_t *const out,
                                       uint8_t *dest, const int stride) {
  int i;
  for (i = 0; i < 16; ++i) {
    idct16x16_add8x1(out[i], &dest, stride);
  }
}
+
// Add one 8-wide coefficient row to the 16-bit predictor at *dest, clamp the
// result to [0, max], store, and advance *dest by one row.
static INLINE void highbd_idct16x16_add8x1(const int16x8_t a,
                                           const int16x8_t max,
                                           uint16_t **const dest,
                                           const int stride) {
  const uint16x8_t pred = vld1q_u16(*dest);
  const int16x8_t clamped_hi =
      vminq_s16(vqaddq_s16(a, vreinterpretq_s16_u16(pred)), max);
  // Saturating shift-left by 0 to unsigned clamps negative lanes to 0.
  const uint16x8_t result = vqshluq_n_s16(clamped_hi, 0);
  vst1q_u16(*dest, result);
  *dest += stride;
}
+
// Round-shift all sixteen rows by 6 in place, then add them to the 16-bit
// destination block, clamping each pixel to the 8-bit range [0, 255].
static INLINE void idct16x16_add_store_bd8(int16x8_t *const out, uint16_t *dest,
                                           const int stride) {
  const int16x8_t max = vdupq_n_s16((1 << 8) - 1);
  int i;
  for (i = 0; i < 16; ++i) {
    out[i] = vrshrq_n_s16(out[i], 6);
  }
  for (i = 0; i < 16; ++i) {
    highbd_idct16x16_add8x1(out[i], max, &dest, stride);
  }
}
+
// Round-shift one coefficient row by 6, accumulate it onto the 16-bit
// predictor at *dest, saturate to [0, 255] (8-bit content in 16-bit
// storage), store, and advance *dest by one row.
static INLINE void highbd_idct16x16_add8x1_bd8(const int16x8_t a,
                                               uint16_t **const dest,
                                               const int stride) {
  const uint16x8_t pred = vld1q_u16(*dest);
  // vrsraq rounds a >> 6 and adds it to the predictor in one instruction.
  const int16x8_t sum = vrsraq_n_s16(vreinterpretq_s16_u16(pred), a, 6);
  const uint16x8_t result = vmovl_u8(vqmovun_s16(sum));
  vst1q_u16(*dest, result);
  *dest += stride;
}
+
// Add thirty-two reconstructed rows in a[] onto the 16-bit destination
// block (8-bit-depth content stored in 16-bit pixels).
static INLINE void highbd_add_and_store_bd8(const int16x8_t *const a,
                                            uint16_t *out, const int stride) {
  int i;
  for (i = 0; i < 32; ++i) {
    highbd_idct16x16_add8x1_bd8(a[i], &out, stride);
  }
}
+
+void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output,
+ void *const dest, const int stride,
+ const int highbd_flag);
+
+void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output,
+ void *const dest, const int stride,
+ const int highbd_flag);
+
+void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
+ int16_t *output);
+
+void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input,
+ int16_t *const output, void *const dest,
+ const int stride, const int highbd_flag);
+
+void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest,
+ const int stride, const int highbd_flag);
+
+void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output);
+void vpx_idct32_16_neon(const int16_t *const input, void *const output,
+ const int stride, const int highbd_flag);
+
+void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output);
+void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride,
+ const int highbd_flag);
+
+#endif // VPX_VPX_DSP_ARM_IDCT_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c
new file mode 100644
index 0000000000..4f909e4935
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c
@@ -0,0 +1,1942 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "mem_neon.h"
+#include "sum_neon.h"
+#include "vpx/vpx_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
// Sum the four 8-bit reference pixels at ref.
static INLINE uint16_t dc_sum_4(const uint8_t *ref) {
  const uint8x8_t r = load_unaligned_u8_4x1(ref);
  return horizontal_add_uint8x4(r);
}
+
// Broadcast the 4-pixel DC value (in the low 32 bits of dc) to all four
// rows of the destination block.
static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride,
                                const uint8x8_t dc) {
  int r;
  for (r = 0; r < 4; ++r) {
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc), 0);
    dst += stride;
  }
}
+
// DC prediction for a 4x4 block: average of the 4 above and 4 left pixels.
void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t above_u8 = load_unaligned_u8_4x1(above);
  const uint8x8_t left_u8 = load_unaligned_u8_4x1(left);
  const uint16x4_t partial = vget_low_u16(vaddl_u8(above_u8, left_u8));
  const uint16_t total = horizontal_add_uint16x4(partial);
  // Rounded average of 8 pixels, broadcast to every lane.
  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(total), 3);
  dc_store_4x4(dst, stride, dc);
}
+
// DC prediction for 4x4 using only the 4 left pixels.
void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint16_t total = dc_sum_4(left);
  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(total), 2);
  (void)above;
  dc_store_4x4(dst, stride, dc);
}
+
// DC prediction for 4x4 using only the 4 above pixels.
void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint16_t total = dc_sum_4(above);
  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(total), 2);
  (void)left;
  dc_store_4x4(dst, stride, dc);
}
+
// DC prediction for 4x4 with no neighbors available: fill with 128.
void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8x8_t fill = vdup_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_4x4(dst, stride, fill);
}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
// Sum the eight 8-bit reference pixels at ref.
static INLINE uint16_t dc_sum_8(const uint8_t *ref) {
  const uint8x8_t r = vld1_u8(ref);
  return horizontal_add_uint8x8(r);
}
+
// Write the broadcast DC vector to all eight rows of the destination.
static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride,
                                const uint8x8_t dc) {
  int r;
  for (r = 0; r < 8; ++r) {
    vst1_u8(dst, dc);
    dst += stride;
  }
}
+
// DC prediction for an 8x8 block: average of the 8 above and 8 left pixels.
void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t a = vld1_u8(above);
  const uint8x8_t l = vld1_u8(left);
  const uint16x8_t partial = vaddl_u8(a, l);
  const uint16_t total = horizontal_add_uint16x8(partial);
  // Rounded average of 16 pixels, broadcast to every lane.
  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(total), 4);
  dc_store_8x8(dst, stride, dc);
}
+
// DC prediction for 8x8 using only the 8 left pixels.
void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint16_t total = dc_sum_8(left);
  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(total), 3);
  (void)above;
  dc_store_8x8(dst, stride, dc);
}
+
// DC prediction for 8x8 using only the 8 above pixels.
void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint16_t total = dc_sum_8(above);
  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(total), 3);
  (void)left;
  dc_store_8x8(dst, stride, dc);
}
+
// DC prediction for 8x8 with no neighbors available: fill with 128.
void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8x8_t fill = vdup_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_8x8(dst, stride, fill);
}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
// Sum the sixteen 8-bit reference pixels at ref.
static INLINE uint16_t dc_sum_16(const uint8_t *ref) {
  const uint8x16_t r = vld1q_u8(ref);
  return horizontal_add_uint8x16(r);
}
+
// Write the broadcast DC vector to all sixteen rows of the destination.
static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride,
                                  const uint8x16_t dc) {
  int r;
  for (r = 0; r < 16; ++r) {
    vst1q_u8(dst, dc);
    dst += stride;
  }
}
+
// DC prediction for 16x16: average of the 16 above and 16 left pixels.
void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a = vld1q_u8(above);
  const uint8x16_t l = vld1q_u8(left);
  // Pairwise widening adds reduce each 16-byte vector to eight u16 sums.
  const uint16x8_t partial = vaddq_u16(vpaddlq_u8(a), vpaddlq_u8(l));
  const uint16_t total = horizontal_add_uint16x8(partial);
  // Rounded average of 32 pixels, broadcast across the whole vector.
  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(total), 5), 0);
  dc_store_16x16(dst, stride, dc);
}
+
// DC prediction for 16x16 using only the 16 left pixels.
void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint16_t total = dc_sum_16(left);
  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(total), 4), 0);
  (void)above;
  dc_store_16x16(dst, stride, dc);
}
+
// DC prediction for 16x16 using only the 16 above pixels.
void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint16_t total = dc_sum_16(above);
  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(total), 4), 0);
  (void)left;
  dc_store_16x16(dst, stride, dc);
}
+
// DC prediction for 16x16 with no neighbors available: fill with 128.
void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8x16_t fill = vdupq_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_16x16(dst, stride, fill);
}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
// Sum the thirty-two 8-bit reference pixels at ref.
static INLINE uint16_t dc_sum_32(const uint8_t *ref) {
  const uint8x16_t lo = vld1q_u8(ref + 0);
  const uint8x16_t hi = vld1q_u8(ref + 16);
  const uint16x8_t partial = vaddq_u16(vpaddlq_u8(lo), vpaddlq_u8(hi));
  return horizontal_add_uint16x8(partial);
}
+
// Write the broadcast DC vector to all thirty-two rows, two 16-byte stores
// per row.
static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride,
                                  const uint8x16_t dc) {
  int r;
  for (r = 0; r < 32; ++r) {
    vst1q_u8(dst, dc);
    vst1q_u8(dst + 16, dc);
    dst += stride;
  }
}
+
// DC prediction for 32x32: average of the 32 above and 32 left pixels.
void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a_lo = vld1q_u8(above + 0);
  const uint8x16_t a_hi = vld1q_u8(above + 16);
  const uint8x16_t l_lo = vld1q_u8(left + 0);
  const uint8x16_t l_hi = vld1q_u8(left + 16);
  const uint16x8_t a_sum = vaddq_u16(vpaddlq_u8(a_lo), vpaddlq_u8(a_hi));
  const uint16x8_t l_sum = vaddq_u16(vpaddlq_u8(l_lo), vpaddlq_u8(l_hi));
  const uint16_t total = horizontal_add_uint16x8(vaddq_u16(a_sum, l_sum));
  // Rounded average of 64 pixels, broadcast across the whole vector.
  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(total), 6), 0);
  dc_store_32x32(dst, stride, dc);
}
+
// DC prediction for 32x32 using only the 32 left pixels.
void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint16_t total = dc_sum_32(left);
  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(total), 5), 0);
  (void)above;
  dc_store_32x32(dst, stride, dc);
}
+
// DC prediction for 32x32 using only the 32 above pixels.
void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint16_t total = dc_sum_32(above);
  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(total), 5), 0);
  (void)left;
  dc_store_32x32(dst, stride, dc);
}
+
// DC prediction for 32x32 with no neighbors available: fill with 128.
void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8x16_t fill = vdupq_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_32x32(dst, stride, fill);
}
+
+// -----------------------------------------------------------------------------
+
// D45 (down-right 45 degree) prediction for 4x4, built from 3-tap rounded
// averages (AVG3) of the above row.
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t a0, a1, a2, d0;
  uint8_t a7;
  (void)left;

  a0 = vld1_u8(above);
  a7 = above[7];

  // [ above[1], ..., above[6], x, x ]
  a1 = vext_u8(a0, a0, 1);
  // [ above[2], ..., above[7], x, x ]
  a2 = vext_u8(a0, a0, 2);

  // d0[0] = AVG3(above[0], above[1], above[2]);
  // ...
  // d0[5] = AVG3(above[5], above[6], above[7]);
  // d0[6] = x (don't care)
  // d0[7] = x (don't care)
  d0 = vrhadd_u8(vhadd_u8(a0, a2), a1);

  // We want:
  // stride=0 [ d0[0], d0[1], d0[2], d0[3] ]
  // stride=1 [ d0[1], d0[2], d0[3], d0[4] ]
  // stride=2 [ d0[2], d0[3], d0[4], d0[5] ]
  // stride=3 [ d0[3], d0[4], d0[5], above[7] ]
  store_u8_4x1(dst + 0 * stride, d0);
  store_u8_4x1(dst + 1 * stride, vext_u8(d0, d0, 1));
  store_u8_4x1(dst + 2 * stride, vext_u8(d0, d0, 2));
  store_u8_4x1(dst + 3 * stride, vext_u8(d0, d0, 3));

  // We stored d0[6] above, so fixup into above[7].
  dst[3 * stride + 3] = a7;
}
+
// D45 prediction for 8x8: AVG3 of the above row, with each successive row
// shifted left by one and padded with duplicates of above[7].
void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t ax0, a0, a1, a7, d0;
  (void)left;

  a0 = vld1_u8(above + 0);
  a1 = vld1_u8(above + 1);
  a7 = vld1_dup_u8(above + 7);

  // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can
  // shift in above[7] later, so shift a0 across by one to get the right
  // inputs:
  // [ x, above[0], ... , above[6] ]
  ax0 = vext_u8(a0, a0, 7);

  // d0[0] = x (don't care)
  // d0[1] = AVG3(above[0], above[1], above[2]);
  // ...
  // d0[7] = AVG3(above[6], above[7], above[8]);
  d0 = vrhadd_u8(vhadd_u8(ax0, a1), a0);

  // Undo the earlier ext, incrementally shift in duplicates of above[7].
  vst1_u8(dst + 0 * stride, vext_u8(d0, a7, 1));
  vst1_u8(dst + 1 * stride, vext_u8(d0, a7, 2));
  vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 3));
  vst1_u8(dst + 3 * stride, vext_u8(d0, a7, 4));
  vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 5));
  vst1_u8(dst + 5 * stride, vext_u8(d0, a7, 6));
  vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 7));
  vst1_u8(dst + 7 * stride, a7);
}
+
// D45 prediction for 16x16: AVG3 of the above row, with each successive row
// shifted left by one and padded with duplicates of above[15].
void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
  uint8x16_t ax0, a0, a1, a15, d0;
  (void)left;

  a0 = vld1q_u8(above + 0);
  a1 = vld1q_u8(above + 1);
  a15 = vld1q_dup_u8(above + 15);

  // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can
  // shift in above[15] later, so shift a0 across by one to get the right
  // inputs:
  // [ x, above[0], ... , above[14] ]
  ax0 = vextq_u8(a0, a0, 15);

  // d0[0] = x (don't care)
  // d0[1] = AVG3(above[0], above[1], above[2]);
  // ...
  // d0[15] = AVG3(above[14], above[15], above[16]);
  d0 = vrhaddq_u8(vhaddq_u8(ax0, a1), a0);

  // Undo the earlier ext, incrementally shift in duplicates of above[15].
  vst1q_u8(dst + 0 * stride, vextq_u8(d0, a15, 1));
  vst1q_u8(dst + 1 * stride, vextq_u8(d0, a15, 2));
  vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 3));
  vst1q_u8(dst + 3 * stride, vextq_u8(d0, a15, 4));
  vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 5));
  vst1q_u8(dst + 5 * stride, vextq_u8(d0, a15, 6));
  vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 7));
  vst1q_u8(dst + 7 * stride, vextq_u8(d0, a15, 8));
  vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 9));
  vst1q_u8(dst + 9 * stride, vextq_u8(d0, a15, 10));
  vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 11));
  vst1q_u8(dst + 11 * stride, vextq_u8(d0, a15, 12));
  vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 13));
  vst1q_u8(dst + 13 * stride, vextq_u8(d0, a15, 14));
  vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 15));
  vst1q_u8(dst + 15 * stride, a15);
}
+
+// D45 prediction (down-right 45 degrees) for a 32x32 block, built from the
+// above row only: every output row is the AVG3-filtered above row shifted
+// one pixel further left, with above[31] replicated into the vacated lanes.
+void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  uint8x16_t ax0, a0, a1, a15, a16, a17, a31, d0[2];
+  (void)left;
+
+  a0 = vld1q_u8(above + 0);
+  a1 = vld1q_u8(above + 1);
+  a15 = vld1q_u8(above + 15);
+  a16 = vld1q_u8(above + 16);
+  // NOTE(review): this load reads above[17..32], one byte past above[31];
+  // it assumes the caller's above buffer is edge-extended -- confirm against
+  // the border-extension done by the callers.
+  a17 = vld1q_u8(above + 17);
+  a31 = vld1q_dup_u8(above + 31);
+
+  // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can
+  // shift in d0[1] (and eventually above[31]) later, so shift a0 across by
+  // one to get the right inputs:
+  // [ x, above[0], ... , above[14] ]
+  ax0 = vextq_u8(a0, a0, 15);
+
+  // d0[0][0] = x (don't care)
+  // d0[0][1] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[0][15] = AVG3(above[14], above[15], above[16]);
+  // d0[1][0] = AVG3(above[15], above[16], above[17]);
+  // ...
+  // d0[1][15] = AVG3(above[30], above[31], above[32]);
+  d0[0] = vrhaddq_u8(vhaddq_u8(ax0, a1), a0);
+  d0[1] = vrhaddq_u8(vhaddq_u8(a15, a17), a16);
+
+  // Undo the earlier ext, incrementally shift in d0[1] and then duplicates
+  // of above[31].  The ext immediates must be compile-time constants, hence
+  // the fully unrolled store sequence.
+  vst1q_u8(dst + 0 * stride + 0, vextq_u8(d0[0], d0[1], 1));
+  vst1q_u8(dst + 0 * stride + 16, vextq_u8(d0[1], a31, 1));
+  vst1q_u8(dst + 1 * stride + 0, vextq_u8(d0[0], d0[1], 2));
+  vst1q_u8(dst + 1 * stride + 16, vextq_u8(d0[1], a31, 2));
+  vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0[0], d0[1], 3));
+  vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0[1], a31, 3));
+  vst1q_u8(dst + 3 * stride + 0, vextq_u8(d0[0], d0[1], 4));
+  vst1q_u8(dst + 3 * stride + 16, vextq_u8(d0[1], a31, 4));
+  vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0[0], d0[1], 5));
+  vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0[1], a31, 5));
+  vst1q_u8(dst + 5 * stride + 0, vextq_u8(d0[0], d0[1], 6));
+  vst1q_u8(dst + 5 * stride + 16, vextq_u8(d0[1], a31, 6));
+  vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0[0], d0[1], 7));
+  vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0[1], a31, 7));
+  vst1q_u8(dst + 7 * stride + 0, vextq_u8(d0[0], d0[1], 8));
+  vst1q_u8(dst + 7 * stride + 16, vextq_u8(d0[1], a31, 8));
+  vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0[0], d0[1], 9));
+  vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0[1], a31, 9));
+  vst1q_u8(dst + 9 * stride + 0, vextq_u8(d0[0], d0[1], 10));
+  vst1q_u8(dst + 9 * stride + 16, vextq_u8(d0[1], a31, 10));
+  vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0[0], d0[1], 11));
+  vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0[1], a31, 11));
+  vst1q_u8(dst + 11 * stride + 0, vextq_u8(d0[0], d0[1], 12));
+  vst1q_u8(dst + 11 * stride + 16, vextq_u8(d0[1], a31, 12));
+  vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0[0], d0[1], 13));
+  vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0[1], a31, 13));
+  vst1q_u8(dst + 13 * stride + 0, vextq_u8(d0[0], d0[1], 14));
+  vst1q_u8(dst + 13 * stride + 16, vextq_u8(d0[1], a31, 14));
+  vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0[0], d0[1], 15));
+  vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0[1], a31, 15));
+  vst1q_u8(dst + 15 * stride + 0, d0[1]);
+  vst1q_u8(dst + 15 * stride + 16, a31);
+
+  // Rows 16-31: d0[0] has been fully shifted out; only filtered values from
+  // d0[1] remain in the left half, and the right half is pure above[31].
+  vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0[1], a31, 1));
+  vst1q_u8(dst + 16 * stride + 16, a31);
+  vst1q_u8(dst + 17 * stride + 0, vextq_u8(d0[1], a31, 2));
+  vst1q_u8(dst + 17 * stride + 16, a31);
+  vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0[1], a31, 3));
+  vst1q_u8(dst + 18 * stride + 16, a31);
+  vst1q_u8(dst + 19 * stride + 0, vextq_u8(d0[1], a31, 4));
+  vst1q_u8(dst + 19 * stride + 16, a31);
+  vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0[1], a31, 5));
+  vst1q_u8(dst + 20 * stride + 16, a31);
+  vst1q_u8(dst + 21 * stride + 0, vextq_u8(d0[1], a31, 6));
+  vst1q_u8(dst + 21 * stride + 16, a31);
+  vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0[1], a31, 7));
+  vst1q_u8(dst + 22 * stride + 16, a31);
+  vst1q_u8(dst + 23 * stride + 0, vextq_u8(d0[1], a31, 8));
+  vst1q_u8(dst + 23 * stride + 16, a31);
+  vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0[1], a31, 9));
+  vst1q_u8(dst + 24 * stride + 16, a31);
+  vst1q_u8(dst + 25 * stride + 0, vextq_u8(d0[1], a31, 10));
+  vst1q_u8(dst + 25 * stride + 16, a31);
+  vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0[1], a31, 11));
+  vst1q_u8(dst + 26 * stride + 16, a31);
+  vst1q_u8(dst + 27 * stride + 0, vextq_u8(d0[1], a31, 12));
+  vst1q_u8(dst + 27 * stride + 16, a31);
+  vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0[1], a31, 13));
+  vst1q_u8(dst + 28 * stride + 16, a31);
+  vst1q_u8(dst + 29 * stride + 0, vextq_u8(d0[1], a31, 14));
+  vst1q_u8(dst + 29 * stride + 16, a31);
+  vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0[1], a31, 15));
+  vst1q_u8(dst + 30 * stride + 16, a31);
+  vst1q_u8(dst + 31 * stride + 0, a31);
+  vst1q_u8(dst + 31 * stride + 16, a31);
+}
+
+// -----------------------------------------------------------------------------
+
+// D63 prediction for a 4x4 block from the above row only.  Even rows are
+// the two-tap average (AVG2) of adjacent above pixels; odd rows are the
+// three-tap average (AVG3); each row pair shifts one pixel further left.
+void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t av0 = load_unaligned_u8_4x1(above + 0);
+  const uint8x8_t av1 = load_unaligned_u8_4x1(above + 1);
+  const uint8x8_t av2 = load_unaligned_u8_4x1(above + 2);
+  const uint8x8_t av3 = load_unaligned_u8_4x1(above + 3);
+  // row0[i] = AVG2(above[i], above[i+1])
+  // row1[i] = AVG3(above[i], above[i+1], above[i+2])
+  // row2[i] = AVG2(above[i+1], above[i+2])
+  // row3[i] = AVG3(above[i+1], above[i+2], above[i+3])
+  const uint8x8_t row0 = vrhadd_u8(av0, av1);
+  const uint8x8_t row1 = vrhadd_u8(vhadd_u8(av0, av2), av1);
+  const uint8x8_t row2 = vrhadd_u8(av1, av2);
+  const uint8x8_t row3 = vrhadd_u8(vhadd_u8(av1, av3), av2);
+  (void)left;
+
+  store_u8_4x1(dst + 0 * stride, row0);
+  store_u8_4x1(dst + 1 * stride, row1);
+  store_u8_4x1(dst + 2 * stride, row2);
+  store_u8_4x1(dst + 3 * stride, row3);
+}
+
+// D63 prediction for an 8x8 block from the above row only.  AVG2 results
+// feed the even rows and AVG3 results the odd rows; each later row pair is
+// shifted one pixel left with above[7] replicated into the freed lanes.
+void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t av0 = vld1_u8(above + 0);
+  const uint8x8_t av1 = vld1_u8(above + 1);
+  const uint8x8_t av2 = vld1_u8(above + 2);
+  const uint8x8_t last = vld1_dup_u8(above + 7);
+  uint8x8_t avg2 = vrhadd_u8(av0, av1);
+  uint8x8_t avg3 = vrhadd_u8(vhadd_u8(av0, av2), av1);
+  (void)left;
+
+  vst1_u8(dst + 0 * stride, avg2);
+  vst1_u8(dst + 1 * stride, avg3);
+
+  // Rotate so the filtered values occupy lanes 1..7, ready to have
+  // duplicates of above[7] shifted in from the right.
+  avg2 = vext_u8(avg2, avg2, 7);
+  avg3 = vext_u8(avg3, avg3, 7);
+
+  vst1_u8(dst + 2 * stride, vext_u8(avg2, last, 2));
+  vst1_u8(dst + 3 * stride, vext_u8(avg3, last, 2));
+  vst1_u8(dst + 4 * stride, vext_u8(avg2, last, 3));
+  vst1_u8(dst + 5 * stride, vext_u8(avg3, last, 3));
+  vst1_u8(dst + 6 * stride, vext_u8(avg2, last, 4));
+  vst1_u8(dst + 7 * stride, vext_u8(avg3, last, 4));
+}
+
+// D63 prediction for a 16x16 block from the above row only.  Even rows hold
+// AVG2 of adjacent above pixels, odd rows hold AVG3; each successive row
+// pair is shifted one pixel further left, replicating above[15] at the end.
+void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  uint8x16_t a0, a1, a2, a15, d0, d1;
+  (void)left;
+
+  a0 = vld1q_u8(above + 0);
+  a1 = vld1q_u8(above + 1);
+  // NOTE(review): reads above[2..17], past above[15] -- assumes the caller's
+  // above buffer is edge-extended; confirm against the border extension.
+  a2 = vld1q_u8(above + 2);
+  a15 = vld1q_dup_u8(above + 15);
+
+  // d0[i] = AVG2(above[i], above[i+1])
+  d0 = vrhaddq_u8(a0, a1);
+  // d1[i] = AVG3(above[i], above[i+1], above[i+2])
+  d1 = vrhaddq_u8(vhaddq_u8(a0, a2), a1);
+
+  vst1q_u8(dst + 0 * stride, d0);
+  vst1q_u8(dst + 1 * stride, d1);
+
+  // Rotate so the filtered values sit in lanes 1..15, ready to have
+  // duplicates of above[15] shifted in from the right.
+  d0 = vextq_u8(d0, d0, 15);
+  d1 = vextq_u8(d1, d1, 15);
+
+  vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 2));
+  vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 2));
+  vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 3));
+  vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 3));
+  vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 4));
+  vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 4));
+  vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 5));
+  vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 5));
+  vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 6));
+  vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 6));
+  vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 7));
+  vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 7));
+  vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 8));
+  vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 8));
+}
+
+// D63 prediction for a 32x32 block from the above row only.  The AVG2 and
+// AVG3 filtered rows are each held as a lo/hi pair of 16-lane vectors; each
+// successive row pair shifts one pixel further left, with above[31]
+// replicated into the vacated lanes.
+void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  uint8x16_t a0, a1, a2, a16, a17, a18, a31, d0_lo, d0_hi, d1_lo, d1_hi;
+  (void)left;
+
+  a0 = vld1q_u8(above + 0);
+  a1 = vld1q_u8(above + 1);
+  a2 = vld1q_u8(above + 2);
+  a16 = vld1q_u8(above + 16);
+  a17 = vld1q_u8(above + 17);
+  // NOTE(review): reads above[18..33], past above[31] -- assumes the
+  // caller's above buffer is edge-extended; confirm against the callers.
+  a18 = vld1q_u8(above + 18);
+  a31 = vld1q_dup_u8(above + 31);
+
+  // d0 = AVG2 of adjacent above pixels, d1 = AVG3 of consecutive triples;
+  // lo covers columns 0-15, hi columns 16-31.
+  d0_lo = vrhaddq_u8(a0, a1);
+  d0_hi = vrhaddq_u8(a16, a17);
+  d1_lo = vrhaddq_u8(vhaddq_u8(a0, a2), a1);
+  d1_hi = vrhaddq_u8(vhaddq_u8(a16, a18), a17);
+
+  vst1q_u8(dst + 0 * stride + 0, d0_lo);
+  vst1q_u8(dst + 0 * stride + 16, d0_hi);
+  vst1q_u8(dst + 1 * stride + 0, d1_lo);
+  vst1q_u8(dst + 1 * stride + 16, d1_hi);
+
+  // Rotate each 32-lane row right by one so the filtered values occupy
+  // lanes 1..31, ready for duplicates of above[31] to be shifted in.  The
+  // hi halves must be rotated before the lo halves they borrow a lane from.
+  d0_hi = vextq_u8(d0_lo, d0_hi, 15);
+  d0_lo = vextq_u8(d0_lo, d0_lo, 15);
+  d1_hi = vextq_u8(d1_lo, d1_hi, 15);
+  d1_lo = vextq_u8(d1_lo, d1_lo, 15);
+
+  // The ext immediates must be compile-time constants, hence the fully
+  // unrolled store sequence.
+  vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 2));
+  vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 2));
+  vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 2));
+  vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 2));
+  vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 3));
+  vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 3));
+  vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 3));
+  vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 3));
+  vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 4));
+  vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 4));
+  vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 4));
+  vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 4));
+  vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 5));
+  vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 5));
+  vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 5));
+  vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 5));
+  vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 6));
+  vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 6));
+  vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 6));
+  vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 6));
+  vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 7));
+  vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 7));
+  vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 7));
+  vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 7));
+  vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 8));
+  vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 8));
+  vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 8));
+  vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 8));
+  vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 9));
+  vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 9));
+  vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 9));
+  vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 9));
+  vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 10));
+  vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 10));
+  vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 10));
+  vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 10));
+  vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 11));
+  vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 11));
+  vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 11));
+  vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 11));
+  vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 12));
+  vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 12));
+  vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 12));
+  vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 12));
+  vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 13));
+  vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 13));
+  vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 13));
+  vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 13));
+  vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 14));
+  vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 14));
+  vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 14));
+  vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 14));
+  vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 15));
+  vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 15));
+  vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 15));
+  vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 15));
+  vst1q_u8(dst + 30 * stride + 0, d0_hi);
+  vst1q_u8(dst + 30 * stride + 16, a31);
+  vst1q_u8(dst + 31 * stride + 0, d1_hi);
+  vst1q_u8(dst + 31 * stride + 16, a31);
+}
+
+// -----------------------------------------------------------------------------
+
+// D117 prediction for a 4x4 block: even rows come from AVG2 of the above
+// edge, odd rows from AVG3; each lower row pair shifts in one filtered
+// left-column value on the left.
+void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  // See vpx_d117_predictor_8x8_neon for more details on the implementation.
+  uint8x8_t az, a0, l0az, d0, d1, d2, d3, col0, col1;
+
+  az = load_unaligned_u8_4x1(above - 1);
+  a0 = load_unaligned_u8_4x1(above + 0);
+  // [ left[0], above[-1], above[0], above[1], x, x, x, x ]
+  l0az = vext_u8(vld1_dup_u8(left), az, 7);
+
+  // Scalar AVG3 down the left column; only one value per row is needed so
+  // a vector computation isn't worthwhile here.
+  col0 = vdup_n_u8((above[-1] + 2 * left[0] + left[1] + 2) >> 2);
+  col1 = vdup_n_u8((left[0] + 2 * left[1] + left[2] + 2) >> 2);
+
+  // d0[i] = AVG2(above[i-1], above[i]); d1[i] = AVG3 over the same span.
+  d0 = vrhadd_u8(az, a0);
+  d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
+  // Rows 2/3 are rows 0/1 shifted right one lane with the column value
+  // entering on the left.
+  d2 = vext_u8(col0, d0, 7);
+  d3 = vext_u8(col1, d1, 7);
+
+  store_u8_4x1(dst + 0 * stride, d0);
+  store_u8_4x1(dst + 1 * stride, d1);
+  store_u8_4x1(dst + 2 * stride, d2);
+  store_u8_4x1(dst + 3 * stride, d3);
+}
+
+// D117 prediction for an 8x8 block.  Even rows are the AVG2 of the above
+// edge and odd rows the AVG3; moving down the block, AVG3-filtered values
+// from the left column (split into even/odd sequences) are shifted in on
+// the left, one per row.
+void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  uint8x8_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd;
+
+  az = vld1_u8(above - 1);
+  a0 = vld1_u8(above + 0);
+  // [ left[0], above[-1], ... , above[5] ]
+  l0az = vext_u8(vld1_dup_u8(left), az, 7);
+
+  l0 = vld1_u8(left + 0);
+  // The last lane here is unused, reading left[8] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], ... , left[7], x ]
+  l1 = vext_u8(l0, l0, 1);
+  // [ above[-1], left[0], ... , left[6] ]
+  azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);
+
+  // d0[0] = AVG2(above[-1], above[0])
+  // d0[1] = AVG2(above[0], above[1])
+  // ...
+  // d0[7] = AVG2(above[6], above[7])
+  d0 = vrhadd_u8(az, a0)
+
+  // d1[0] = AVG3(left[0], above[-1], above[0])
+  // d1[1] = AVG3(above[-1], above[0], above[1])
+  // ...
+  // d1[7] = AVG3(above[5], above[6], above[7])
+  d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
+
+  // The ext instruction shifts elements in from the end of the vector rather
+  // than the start, so reverse the vector to put the elements to be shifted in
+  // at the end. The lowest two lanes here are unused:
+  // col0[7] = AVG3(above[-1], left[0], left[1])
+  // col0[6] = AVG3(left[0], left[1], left[2])
+  // ...
+  // col0[2] = AVG3(left[4], left[5], left[6])
+  // col0[1] = x (don't care)
+  // col0[0] = x (don't care)
+  col0 = vrev64_u8(vrhadd_u8(vhadd_u8(azl0, l1), l0));
+
+  // We don't care about the first parameter to this uzp since we only ever use
+  // the high three elements, we just use col0 again since it is already
+  // available:
+  // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ]
+  // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ]
+  col0_even = vuzp_u8(col0, col0).val[1];
+  col0_odd = vuzp_u8(col0, col0).val[0];
+
+  // Incrementally shift more elements from col0 into d0/1:
+  // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ]
+  // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ]
+  // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ]
+  // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
+  // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ]
+  // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ]
+  // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ]
+  // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ]
+  vst1_u8(dst + 0 * stride, d0);
+  vst1_u8(dst + 1 * stride, d1);
+  vst1_u8(dst + 2 * stride, vext_u8(col0_even, d0, 7));
+  vst1_u8(dst + 3 * stride, vext_u8(col0_odd, d1, 7));
+  vst1_u8(dst + 4 * stride, vext_u8(col0_even, d0, 6));
+  vst1_u8(dst + 5 * stride, vext_u8(col0_odd, d1, 6));
+  vst1_u8(dst + 6 * stride, vext_u8(col0_even, d0, 5));
+  vst1_u8(dst + 7 * stride, vext_u8(col0_odd, d1, 5));
+}
+
+// D117 prediction for a 16x16 block.  Same scheme as the 8x8 version: AVG2
+// of the above edge on even rows, AVG3 on odd rows, with AVG3-filtered left
+// column values shifted in on the left as the rows descend.
+void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  // See vpx_d117_predictor_8x8_neon for more details on the implementation.
+  uint8x16_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd;
+
+  az = vld1q_u8(above - 1);
+  a0 = vld1q_u8(above + 0);
+  // [ left[0], above[-1], ... , above[13] ]
+  l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
+
+  l0 = vld1q_u8(left + 0);
+  // The last lane here is unused, reading left[16] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], ... , left[15], x ]
+  l1 = vextq_u8(l0, l0, 1);
+  // [ above[-1], left[0], ... , left[14] ]
+  azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
+
+  // d0 = AVG2 of the above edge, d1 = AVG3 of the above edge.
+  d0 = vrhaddq_u8(az, a0);
+  d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
+
+  // AVG3 down the left column, then reverse so the values to be shifted in
+  // sit at the high end of the vector (ext shifts in from the end).
+  col0 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
+  col0 = vrev64q_u8(vextq_u8(col0, col0, 8));
+
+  // The low nine lanes here are unused so the first input to the uzp is
+  // unused, so just use a duplicate of col0 since we have it already. This
+  // also means that the lowest lane of col0 here is unused.
+  col0_even = vuzpq_u8(col0, col0).val[1];
+  col0_odd = vuzpq_u8(col0, col0).val[0];
+
+  // Incrementally shift column values into the left end of d0/d1.
+  vst1q_u8(dst + 0 * stride, d0);
+  vst1q_u8(dst + 1 * stride, d1);
+  vst1q_u8(dst + 2 * stride, vextq_u8(col0_even, d0, 15));
+  vst1q_u8(dst + 3 * stride, vextq_u8(col0_odd, d1, 15));
+  vst1q_u8(dst + 4 * stride, vextq_u8(col0_even, d0, 14));
+  vst1q_u8(dst + 5 * stride, vextq_u8(col0_odd, d1, 14));
+  vst1q_u8(dst + 6 * stride, vextq_u8(col0_even, d0, 13));
+  vst1q_u8(dst + 7 * stride, vextq_u8(col0_odd, d1, 13));
+  vst1q_u8(dst + 8 * stride, vextq_u8(col0_even, d0, 12));
+  vst1q_u8(dst + 9 * stride, vextq_u8(col0_odd, d1, 12));
+  vst1q_u8(dst + 10 * stride, vextq_u8(col0_even, d0, 11));
+  vst1q_u8(dst + 11 * stride, vextq_u8(col0_odd, d1, 11));
+  vst1q_u8(dst + 12 * stride, vextq_u8(col0_even, d0, 10));
+  vst1q_u8(dst + 13 * stride, vextq_u8(col0_odd, d1, 10));
+  vst1q_u8(dst + 14 * stride, vextq_u8(col0_even, d0, 9));
+  vst1q_u8(dst + 15 * stride, vextq_u8(col0_odd, d1, 9));
+}
+
+// D117 prediction for a 32x32 block.  Same scheme as the 8x8 version, with
+// the above-edge rows and the left column each split into lo/hi 16-lane
+// halves.
+void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  // See vpx_d117_predictor_8x8_neon for more details on the implementation.
+  uint8x16_t az, a0, a14, a15, a16, l0az, d0_lo, d0_hi, d1_lo, d1_hi, l0, l1,
+      l15, l16, l17, azl0, col0_lo, col0_hi, col0_even, col0_odd;
+
+  az = vld1q_u8(above - 1);
+  a0 = vld1q_u8(above + 0);
+  a14 = vld1q_u8(above + 14);
+  a15 = vld1q_u8(above + 15);
+  a16 = vld1q_u8(above + 16);
+  // [ left[0], above[-1], ... , above[13] ]
+  l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
+
+  l0 = vld1q_u8(left + 0);
+  l1 = vld1q_u8(left + 1);
+  l15 = vld1q_u8(left + 15);
+  l16 = vld1q_u8(left + 16);
+  // The last lane here is unused, reading left[32] would cause a buffer
+  // over-read (observed as an address-sanitizer failure), so just fill with a
+  // duplicate of left[16] to avoid needing to materialize a zero:
+  // [ left[17], ... , left[31], x ]
+  l17 = vextq_u8(l16, l16, 1);
+  // [ above[-1], left[0], ... , left[14] ]
+  azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
+
+  // AVG2 (d0) and AVG3 (d1) of the above edge, in lo/hi halves.
+  d0_lo = vrhaddq_u8(az, a0);
+  d0_hi = vrhaddq_u8(a15, a16);
+  d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
+  d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15);
+
+  // The last lane of col0_hi is unused here.
+  col0_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
+  col0_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16);
+
+  // Reverse so the column values to be shifted in sit at the high end of
+  // each vector (ext shifts elements in from the end).
+  col0_lo = vrev64q_u8(vextq_u8(col0_lo, col0_lo, 8));
+  col0_hi = vrev64q_u8(vextq_u8(col0_hi, col0_hi, 8));
+
+  // The first lane of these are unused since they are only ever called as
+  // ext(col0, _, i) where i >= 1.
+  col0_even = vuzpq_u8(col0_hi, col0_lo).val[1];
+  col0_odd = vuzpq_u8(col0_hi, col0_lo).val[0];
+
+  // Incrementally shift column values into the left end of d0/d1; the hi
+  // half of each output row borrows its leading lanes from the lo half.
+  vst1q_u8(dst + 0 * stride + 0, d0_lo);
+  vst1q_u8(dst + 0 * stride + 16, d0_hi);
+  vst1q_u8(dst + 1 * stride + 0, d1_lo);
+  vst1q_u8(dst + 1 * stride + 16, d1_hi);
+  vst1q_u8(dst + 2 * stride + 0, vextq_u8(col0_even, d0_lo, 15));
+  vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_lo, d0_hi, 15));
+  vst1q_u8(dst + 3 * stride + 0, vextq_u8(col0_odd, d1_lo, 15));
+  vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 15));
+  vst1q_u8(dst + 4 * stride + 0, vextq_u8(col0_even, d0_lo, 14));
+  vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_lo, d0_hi, 14));
+  vst1q_u8(dst + 5 * stride + 0, vextq_u8(col0_odd, d1_lo, 14));
+  vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 14));
+  vst1q_u8(dst + 6 * stride + 0, vextq_u8(col0_even, d0_lo, 13));
+  vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_lo, d0_hi, 13));
+  vst1q_u8(dst + 7 * stride + 0, vextq_u8(col0_odd, d1_lo, 13));
+  vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 13));
+  vst1q_u8(dst + 8 * stride + 0, vextq_u8(col0_even, d0_lo, 12));
+  vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_lo, d0_hi, 12));
+  vst1q_u8(dst + 9 * stride + 0, vextq_u8(col0_odd, d1_lo, 12));
+  vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_lo, d1_hi, 12));
+  vst1q_u8(dst + 10 * stride + 0, vextq_u8(col0_even, d0_lo, 11));
+  vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_lo, d0_hi, 11));
+  vst1q_u8(dst + 11 * stride + 0, vextq_u8(col0_odd, d1_lo, 11));
+  vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_lo, d1_hi, 11));
+  vst1q_u8(dst + 12 * stride + 0, vextq_u8(col0_even, d0_lo, 10));
+  vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_lo, d0_hi, 10));
+  vst1q_u8(dst + 13 * stride + 0, vextq_u8(col0_odd, d1_lo, 10));
+  vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_lo, d1_hi, 10));
+  vst1q_u8(dst + 14 * stride + 0, vextq_u8(col0_even, d0_lo, 9));
+  vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_lo, d0_hi, 9));
+  vst1q_u8(dst + 15 * stride + 0, vextq_u8(col0_odd, d1_lo, 9));
+  vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_lo, d1_hi, 9));
+  vst1q_u8(dst + 16 * stride + 0, vextq_u8(col0_even, d0_lo, 8));
+  vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_lo, d0_hi, 8));
+  vst1q_u8(dst + 17 * stride + 0, vextq_u8(col0_odd, d1_lo, 8));
+  vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_lo, d1_hi, 8));
+  vst1q_u8(dst + 18 * stride + 0, vextq_u8(col0_even, d0_lo, 7));
+  vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_lo, d0_hi, 7));
+  vst1q_u8(dst + 19 * stride + 0, vextq_u8(col0_odd, d1_lo, 7));
+  vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_lo, d1_hi, 7));
+  vst1q_u8(dst + 20 * stride + 0, vextq_u8(col0_even, d0_lo, 6));
+  vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_lo, d0_hi, 6));
+  vst1q_u8(dst + 21 * stride + 0, vextq_u8(col0_odd, d1_lo, 6));
+  vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_lo, d1_hi, 6));
+  vst1q_u8(dst + 22 * stride + 0, vextq_u8(col0_even, d0_lo, 5));
+  vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_lo, d0_hi, 5));
+  vst1q_u8(dst + 23 * stride + 0, vextq_u8(col0_odd, d1_lo, 5));
+  vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_lo, d1_hi, 5));
+  vst1q_u8(dst + 24 * stride + 0, vextq_u8(col0_even, d0_lo, 4));
+  vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_lo, d0_hi, 4));
+  vst1q_u8(dst + 25 * stride + 0, vextq_u8(col0_odd, d1_lo, 4));
+  vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_lo, d1_hi, 4));
+  vst1q_u8(dst + 26 * stride + 0, vextq_u8(col0_even, d0_lo, 3));
+  vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_lo, d0_hi, 3));
+  vst1q_u8(dst + 27 * stride + 0, vextq_u8(col0_odd, d1_lo, 3));
+  vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_lo, d1_hi, 3));
+  vst1q_u8(dst + 28 * stride + 0, vextq_u8(col0_even, d0_lo, 2));
+  vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_lo, d0_hi, 2));
+  vst1q_u8(dst + 29 * stride + 0, vextq_u8(col0_odd, d1_lo, 2));
+  vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_lo, d1_hi, 2));
+  vst1q_u8(dst + 30 * stride + 0, vextq_u8(col0_even, d0_lo, 1));
+  vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_lo, d0_hi, 1));
+  vst1q_u8(dst + 31 * stride + 0, vextq_u8(col0_odd, d1_lo, 1));
+  vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_lo, d1_hi, 1));
+}
+
+// -----------------------------------------------------------------------------
+
+// D135 prediction for a 4x4 block: every output pixel is the AVG3 of three
+// consecutive pixels along the combined (reversed-left, above) edge; each
+// row down reads the edge one pixel earlier.
+void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t top = vld1_u8(above - 1);             // [ X, A0, A1, A2, .. ]
+  const uint8x8_t rev_left = vrev64_u8(vld1_u8(left));  // [ L3, L2, L1, L0, .. ]
+  // Three staggered windows over the edge [ L3, L2, L1, L0, X, A0, A1, A2 ].
+  const uint8x8_t edge0 = vext_u8(rev_left, top, 4);
+  const uint8x8_t edge1 = vext_u8(rev_left, top, 5);
+  const uint8x8_t edge2 = vext_u8(edge1, edge1, 1);
+  // AVG3 of every consecutive triple along the edge.
+  const uint8x8_t sum = vhadd_u8(edge2, edge0);
+  const uint8x8_t pred = vrhadd_u8(sum, edge1);
+
+  store_u8_4x1(dst + 0 * stride, vext_u8(pred, pred, 3));
+  store_u8_4x1(dst + 1 * stride, vext_u8(pred, pred, 2));
+  store_u8_4x1(dst + 2 * stride, vext_u8(pred, pred, 1));
+  store_u8_4x1(dst + 3 * stride, pred);
+}
+
+// D135 prediction for an 8x8 block.  The 16-lane edge combines the reversed
+// left column with the above row (X is above[-1]); each output pixel is the
+// AVG3 of three consecutive edge pixels, and each row down reads the edge
+// one pixel earlier.  Variable names encode the lane contents.
+void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t XA0123456 = vld1_u8(above - 1);
+  const uint8x8_t A01234567 = vld1_u8(above);
+  const uint8x8_t A1234567_ = vld1_u8(above + 1);
+  const uint8x8_t L01234567 = vld1_u8(left);
+  const uint8x8_t L76543210 = vrev64_u8(L01234567);
+  const uint8x8_t L6543210X = vext_u8(L76543210, XA0123456, 1);
+  const uint8x8_t L543210XA0 = vext_u8(L76543210, XA0123456, 2);
+  // Three 16-lane staggered views of the full edge.
+  const uint8x16_t L76543210XA0123456 = vcombine_u8(L76543210, XA0123456);
+  const uint8x16_t L6543210XA01234567 = vcombine_u8(L6543210X, A01234567);
+  const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_);
+  // AVG3 of every consecutive triple along the edge.
+  const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_);
+  const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567);
+
+  // Each row extracts a different 8-lane window of the filtered edge.
+  vst1_u8(dst + 0 * stride, vget_low_u8(vextq_u8(row, row, 7)));
+  vst1_u8(dst + 1 * stride, vget_low_u8(vextq_u8(row, row, 6)));
+  vst1_u8(dst + 2 * stride, vget_low_u8(vextq_u8(row, row, 5)));
+  vst1_u8(dst + 3 * stride, vget_low_u8(vextq_u8(row, row, 4)));
+  vst1_u8(dst + 4 * stride, vget_low_u8(vextq_u8(row, row, 3)));
+  vst1_u8(dst + 5 * stride, vget_low_u8(vextq_u8(row, row, 2)));
+  vst1_u8(dst + 6 * stride, vget_low_u8(vextq_u8(row, row, 1)));
+  vst1_u8(dst + 7 * stride, vget_low_u8(row));
+}
+
+// Store eight 16-byte rows, advancing *dst by one row per store.
+static INLINE void d135_store_16x8(
+    uint8_t **dst, const ptrdiff_t stride, const uint8x16_t row_0,
+    const uint8x16_t row_1, const uint8x16_t row_2, const uint8x16_t row_3,
+    const uint8x16_t row_4, const uint8x16_t row_5, const uint8x16_t row_6,
+    const uint8x16_t row_7) {
+  const uint8x16_t rows[8] = { row_0, row_1, row_2, row_3,
+                               row_4, row_5, row_6, row_7 };
+  int i;
+  for (i = 0; i < 8; ++i) {
+    vst1q_u8(*dst, rows[i]);
+    *dst += stride;
+  }
+}
+
+// D135 prediction for a 16x16 block.  The 32-lane edge (reversed left
+// column followed by the above row, X = above[-1]) is AVG3-filtered into
+// row_0/row_1; each output row is a different 16-lane window of that
+// filtered edge, sliding one pixel earlier per row.
+void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t XA0123456789abcde = vld1q_u8(above - 1);
+  const uint8x16_t A0123456789abcdef = vld1q_u8(above);
+  const uint8x16_t A123456789abcdef_ = vld1q_u8(above + 1);
+  const uint8x16_t L0123456789abcdef = vld1q_u8(left);
+  const uint8x8_t L76543210 = vrev64_u8(vget_low_u8(L0123456789abcdef));
+  const uint8x8_t Lfedcba98 = vrev64_u8(vget_high_u8(L0123456789abcdef));
+  const uint8x16_t Lfedcba9876543210 = vcombine_u8(Lfedcba98, L76543210);
+  const uint8x16_t Ledcba9876543210X =
+      vextq_u8(Lfedcba9876543210, XA0123456789abcde, 1);
+  const uint8x16_t Ldcba9876543210XA0 =
+      vextq_u8(Lfedcba9876543210, XA0123456789abcde, 2);
+  // AVG3 of each consecutive triple: row_0 covers the left-column half of
+  // the edge, row_1 the above-row half.
+  const uint8x16_t avg_0 = vhaddq_u8(Lfedcba9876543210, Ldcba9876543210XA0);
+  const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_);
+  const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X);
+  const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef);
+
+  // Successive 16-lane windows of [row_0 | row_1], one per output row.
+  const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15);
+  const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14);
+  const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13);
+  const uint8x16_t r_3 = vextq_u8(row_0, row_1, 12);
+  const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11);
+  const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10);
+  const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9);
+  const uint8x16_t r_7 = vextq_u8(row_0, row_1, 8);
+  const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7);
+  const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6);
+  const uint8x16_t r_a = vextq_u8(row_0, row_1, 5);
+  const uint8x16_t r_b = vextq_u8(row_0, row_1, 4);
+  const uint8x16_t r_c = vextq_u8(row_0, row_1, 3);
+  const uint8x16_t r_d = vextq_u8(row_0, row_1, 2);
+  const uint8x16_t r_e = vextq_u8(row_0, row_1, 1);
+
+  d135_store_16x8(&dst, stride, r_0, r_1, r_2, r_3, r_4, r_5, r_6, r_7);
+  d135_store_16x8(&dst, stride, r_8, r_9, r_a, r_b, r_c, r_d, r_e, row_0);
+}
+
+// Write one 32-wide row in each half of the 32x32 block: [row_1|row_2] at
+// the current row and [row_0|row_1] sixteen rows further down, then advance
+// *dst by a single row.
+static INLINE void d135_store_32x2(uint8_t **dst, const ptrdiff_t stride,
+                                   const uint8x16_t row_0,
+                                   const uint8x16_t row_1,
+                                   const uint8x16_t row_2) {
+  uint8_t *const out = *dst;
+  vst1q_u8(out + 0, row_1);
+  vst1q_u8(out + 16, row_2);
+  vst1q_u8(out + 16 * stride + 0, row_0);
+  vst1q_u8(out + 16 * stride + 16, row_1);
+  *dst += stride;
+}
+
+// D135 prediction for a 32x32 block.  The 64-lane edge (reversed left
+// column, then the above row; X = above[-1], LL/LU = lower/upper left
+// halves, AL/AR = left/right above halves) is AVG3-filtered into
+// row_0..row_3; every output row is a 32-lane window of that filtered edge,
+// sliding one pixel earlier per row.  Variable names encode lane contents.
+void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t LL0123456789abcdef = vld1q_u8(left + 16);
+  const uint8x16_t LU0123456789abcdef = vld1q_u8(left);
+  const uint8x8_t LL76543210 = vrev64_u8(vget_low_u8(LL0123456789abcdef));
+  const uint8x8_t LU76543210 = vrev64_u8(vget_low_u8(LU0123456789abcdef));
+  const uint8x8_t LLfedcba98 = vrev64_u8(vget_high_u8(LL0123456789abcdef));
+  const uint8x8_t LUfedcba98 = vrev64_u8(vget_high_u8(LU0123456789abcdef));
+  const uint8x16_t LLfedcba9876543210 = vcombine_u8(LLfedcba98, LL76543210);
+  const uint8x16_t LUfedcba9876543210 = vcombine_u8(LUfedcba98, LU76543210);
+  const uint8x16_t LLedcba9876543210Uf =
+      vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 1);
+  const uint8x16_t LLdcba9876543210Ufe =
+      vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 2);
+  // row_0: AVG3 along the lower (reversed) left-column quarter of the edge.
+  const uint8x16_t avg_0 = vhaddq_u8(LLfedcba9876543210, LLdcba9876543210Ufe);
+  const uint8x16_t row_0 = vrhaddq_u8(avg_0, LLedcba9876543210Uf);
+
+  const uint8x16_t XAL0123456789abcde = vld1q_u8(above - 1);
+  const uint8x16_t LUedcba9876543210X =
+      vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 1);
+  const uint8x16_t LUdcba9876543210XA0 =
+      vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 2);
+  // row_1: AVG3 along the upper (reversed) left-column quarter.
+  const uint8x16_t avg_1 = vhaddq_u8(LUfedcba9876543210, LUdcba9876543210XA0);
+  const uint8x16_t row_1 = vrhaddq_u8(avg_1, LUedcba9876543210X);
+
+  const uint8x16_t AL0123456789abcdef = vld1q_u8(above);
+  const uint8x16_t AL123456789abcdefg = vld1q_u8(above + 1);
+  const uint8x16_t ALfR0123456789abcde = vld1q_u8(above + 15);
+  const uint8x16_t AR0123456789abcdef = vld1q_u8(above + 16);
+  const uint8x16_t AR123456789abcdef_ = vld1q_u8(above + 17);
+  // row_2/row_3: AVG3 along the left and right halves of the above row.
+  const uint8x16_t avg_2 = vhaddq_u8(XAL0123456789abcde, AL123456789abcdefg);
+  const uint8x16_t row_2 = vrhaddq_u8(avg_2, AL0123456789abcdef);
+  const uint8x16_t avg_3 = vhaddq_u8(ALfR0123456789abcde, AR123456789abcdef_);
+  const uint8x16_t row_3 = vrhaddq_u8(avg_3, AR0123456789abcdef);
+
+  // Each iteration stores one window position into the top and bottom
+  // halves of the block; the ext immediate must be a compile-time constant,
+  // hence the unrolled blocks.
+  {
+    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15);
+    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 15);
+    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 15);
+    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+  }
+
+  {
+    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 14);
+    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 14);
+    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 14);
+    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+  }
+
+  {
+    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 13);
+    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 13);
+    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 13);
+    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+  }
+
+  {
+    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 12);
+    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 12);
+    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 12);
+    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+  }
+
+  {
+    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 11);
+    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 11);
+    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 11);
+    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+  }
+
+  {
+    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 10);
+    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 10);
+    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 10);
+    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+  }
+
+  {
+    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 9);
+    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 9);
+    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 9);
+    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+  }
+
+  {
+    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 8);
+    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 8);
+    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 8);
+    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+  }
+
+  {
+    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 7);
+    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 7);
+    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 7);
+    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+  }
+
+  {
+    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 6);
+    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 6);
+    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 6);
+    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+  }
+
+  {
+    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 5);
+    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 5);
+    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 5);
+    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+  }
+
+  {
+    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 4);
+    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 4);
+    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 4);
+    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+  }
+
+  {
+    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 3);
+    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 3);
+    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 3);
+    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+  }
+
+  {
+    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 2);
+    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 2);
+    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 2);
+    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+  }
+
+  {
+    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 1);
+    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 1);
+    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 1);
+    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
+  }
+
+  d135_store_32x2(&dst, stride, row_0, row_1, row_2);
+}
+
+// -----------------------------------------------------------------------------
+
void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  // See vpx_d153_predictor_8x8_neon for more details on the implementation.
  uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02;

  az = load_unaligned_u8_4x1(above - 1);
  a0 = load_unaligned_u8_4x1(above + 0);
  // [ left[0], above[-1], above[0], above[1], x, x, x, x ]
  l0az = vext_u8(vld1_dup_u8(left), az, 7);

  l0 = load_unaligned_u8_4x1(left + 0);
  l1 = load_unaligned_u8_4x1(left + 1);
  // [ above[-1], left[0], left[1], left[2], x, x, x, x ]
  azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);

  // d0: AVG2 pairs down the left edge; d1: AVG3 along the top edge;
  // d2: AVG3 down the left edge (see the 8x8 version for lane-by-lane docs).
  d0 = vrhadd_u8(azl0, l0);
  d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
  d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0);

  // Interleave d0/d2 and reverse so vext_u8 can shift one (d0, d2) pair in
  // from the end of the vector per row.
  d02 = vrev64_u8(vzip_u8(d0, d2).val[0]);

  store_u8_4x1(dst + 0 * stride, vext_u8(d02, d1, 7));
  store_u8_4x1(dst + 1 * stride, vext_u8(d02, d1, 5));
  store_u8_4x1(dst + 2 * stride, vext_u8(d02, d1, 3));
  store_u8_4x1(dst + 3 * stride, vext_u8(d02, d1, 1));
}
+
void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  // D153 directional intra prediction built from pairwise (AVG2) and
  // three-tap (AVG3) averages of the left column and above row; each output
  // row shifts one more (d0, d2) pair in ahead of the top-edge averages.
  uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi;

  az = vld1_u8(above - 1);
  a0 = vld1_u8(above + 0);
  // [ left[0], above[-1], ... , above[5] ]
  l0az = vext_u8(vld1_dup_u8(left), az, 7);

  l0 = vld1_u8(left);
  // The last lane here is unused, reading left[8] could cause a buffer
  // over-read, so just fill with a duplicate of left[0] to avoid needing to
  // materialize a zero:
  // [ left[1], ... , left[7], x ]
  l1 = vext_u8(l0, l0, 1);
  // [ above[-1], left[0], ... , left[6] ]
  azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);

  // d0[0] = AVG2(above[-1], left[0])
  // d0[1] = AVG2(left[0], left[1])
  // ...
  // d0[7] = AVG2(left[6], left[7])
  d0 = vrhadd_u8(azl0, l0);

  // d1[0] = AVG3(left[0], above[-1], above[0])
  // d1[1] = AVG3(above[-1], above[0], above[1])
  // ...
  // d1[7] = AVG3(above[5], above[6], above[7])
  d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);

  // d2[0] = AVG3(above[-1], left[0], left[1])
  // d2[1] = AVG3(left[0], left[1], left[2])
  // ...
  // d2[6] = AVG3(left[5], left[6], left[7])
  // d2[7] = x (don't care)
  d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0);

  // The ext instruction shifts elements in from the end of the vector rather
  // than the start, so reverse the vectors to put the elements to be shifted
  // in at the end. The lowest lane of d02_lo is unused.
  d02_lo = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[0];
  d02_hi = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[1];

  // Incrementally shift more elements from d0/d2 reversed into d1:
  // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
  // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ]
  // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ]
  // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ]
  // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ]
  // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ]
  // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ]
  // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ]
  vst1_u8(dst + 0 * stride, vext_u8(d02_hi, d1, 7));
  vst1_u8(dst + 1 * stride, vext_u8(d02_hi, d1, 5));
  vst1_u8(dst + 2 * stride, vext_u8(d02_hi, d1, 3));
  vst1_u8(dst + 3 * stride, vext_u8(d02_hi, d1, 1));
  vst1_u8(dst + 4 * stride, vext_u8(d02_lo, d02_hi, 7));
  vst1_u8(dst + 5 * stride, vext_u8(d02_lo, d02_hi, 5));
  vst1_u8(dst + 6 * stride, vext_u8(d02_lo, d02_hi, 3));
  vst1_u8(dst + 7 * stride, vext_u8(d02_lo, d02_hi, 1));
}
+
void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  // See vpx_d153_predictor_8x8_neon for more details on the implementation.
  uint8x16_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi;

  az = vld1q_u8(above - 1);
  a0 = vld1q_u8(above + 0);
  // [ left[0], above[-1], ... , above[13] ]
  l0az = vextq_u8(vld1q_dup_u8(left), az, 15);

  l0 = vld1q_u8(left + 0);
  // The last lane here is unused, reading left[16] could cause a buffer
  // over-read, so just fill with a duplicate of left[0] to avoid needing to
  // materialize a zero:
  // [ left[1], ... , left[15], x ]
  l1 = vextq_u8(l0, l0, 1);
  // [ above[-1], left[0], ... , left[14] ]
  azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);

  // d0: AVG2 pairs down the left edge; d1: AVG3 along the top edge;
  // d2: AVG3 down the left edge (see the 8x8 version for lane-by-lane docs).
  d0 = vrhaddq_u8(azl0, l0);
  d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
  d2 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);

  // Reverse all 16 lanes (ext by 8 swaps the halves, rev64 reverses within
  // each half) so vextq_u8 can shift (d2, d0) pairs in from the end.
  d0 = vrev64q_u8(vextq_u8(d0, d0, 8));
  d2 = vrev64q_u8(vextq_u8(d2, d2, 8));

  // The lowest lane of d02_lo is unused.
  d02_lo = vzipq_u8(d2, d0).val[0];
  d02_hi = vzipq_u8(d2, d0).val[1];

  vst1q_u8(dst + 0 * stride, vextq_u8(d02_hi, d1, 15));
  vst1q_u8(dst + 1 * stride, vextq_u8(d02_hi, d1, 13));
  vst1q_u8(dst + 2 * stride, vextq_u8(d02_hi, d1, 11));
  vst1q_u8(dst + 3 * stride, vextq_u8(d02_hi, d1, 9));
  vst1q_u8(dst + 4 * stride, vextq_u8(d02_hi, d1, 7));
  vst1q_u8(dst + 5 * stride, vextq_u8(d02_hi, d1, 5));
  vst1q_u8(dst + 6 * stride, vextq_u8(d02_hi, d1, 3));
  vst1q_u8(dst + 7 * stride, vextq_u8(d02_hi, d1, 1));
  vst1q_u8(dst + 8 * stride, vextq_u8(d02_lo, d02_hi, 15));
  vst1q_u8(dst + 9 * stride, vextq_u8(d02_lo, d02_hi, 13));
  vst1q_u8(dst + 10 * stride, vextq_u8(d02_lo, d02_hi, 11));
  vst1q_u8(dst + 11 * stride, vextq_u8(d02_lo, d02_hi, 9));
  vst1q_u8(dst + 12 * stride, vextq_u8(d02_lo, d02_hi, 7));
  vst1q_u8(dst + 13 * stride, vextq_u8(d02_lo, d02_hi, 5));
  vst1q_u8(dst + 14 * stride, vextq_u8(d02_lo, d02_hi, 3));
  vst1q_u8(dst + 15 * stride, vextq_u8(d02_lo, d02_hi, 1));
}
+
void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  // See vpx_d153_predictor_8x8_neon for more details on the implementation.
  // The 32-wide edges are processed as low/high 16-lane halves (suffixes
  // _lo/_hi); each output row is stored as two 16-byte vextq results.
  uint8x16_t az, a0, a14, a15, a16, l0az, l0, l1, l15, l16, l17, azl0, d0_lo,
      d0_hi, d1_lo, d1_hi, d2_lo, d2_hi;
  uint8x16x2_t d02_hi, d02_lo;

  az = vld1q_u8(above - 1);
  a0 = vld1q_u8(above + 0);
  a14 = vld1q_u8(above + 14);
  a15 = vld1q_u8(above + 15);
  a16 = vld1q_u8(above + 16);
  // [ left[0], above[-1], ... , above[13] ]
  l0az = vextq_u8(vld1q_dup_u8(left), az, 15);

  l0 = vld1q_u8(left);
  l1 = vld1q_u8(left + 1);
  l15 = vld1q_u8(left + 15);
  l16 = vld1q_u8(left + 16);
  // The last lane here is unused, reading left[32] would cause a buffer
  // over-read (observed as an address-sanitizer failure), so just fill with a
  // duplicate of left[16] to avoid needing to materialize a zero:
  // [ left[17], ... , left[31], x ]
  l17 = vextq_u8(l16, l16, 1);
  // [ above[-1], left[0], ... , left[14] ]
  azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);

  // AVG2 pairs down the left edge, split into two 16-lane halves.
  d0_lo = vrhaddq_u8(azl0, l0);
  d0_hi = vrhaddq_u8(l15, l16);

  // AVG3 along the top edge, split into two 16-lane halves.
  d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
  d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15);

  // AVG3 down the left edge. The highest lane of d2_hi is unused.
  d2_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
  d2_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16);

  // Reverse all lanes (ext by 8 swaps halves, rev64 reverses within each
  // half) so vextq_u8 can shift (d2, d0) pairs in from the end.
  d0_lo = vrev64q_u8(vextq_u8(d0_lo, d0_lo, 8));
  d0_hi = vrev64q_u8(vextq_u8(d0_hi, d0_hi, 8));

  d2_lo = vrev64q_u8(vextq_u8(d2_lo, d2_lo, 8));
  d2_hi = vrev64q_u8(vextq_u8(d2_hi, d2_hi, 8));

  // d02_hi.val[0][0] is unused here.
  d02_hi = vzipq_u8(d2_hi, d0_hi);
  d02_lo = vzipq_u8(d2_lo, d0_lo);

  // Each row shifts one more (d2, d0) pair in ahead of the top-edge AVG3
  // values; the "+ 16" stores produce the right half of each 32-wide row.
  vst1q_u8(dst + 0 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 15));
  vst1q_u8(dst + 0 * stride + 16, vextq_u8(d1_lo, d1_hi, 15));
  vst1q_u8(dst + 1 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 13));
  vst1q_u8(dst + 1 * stride + 16, vextq_u8(d1_lo, d1_hi, 13));
  vst1q_u8(dst + 2 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 11));
  vst1q_u8(dst + 2 * stride + 16, vextq_u8(d1_lo, d1_hi, 11));
  vst1q_u8(dst + 3 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 9));
  vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 9));
  vst1q_u8(dst + 4 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 7));
  vst1q_u8(dst + 4 * stride + 16, vextq_u8(d1_lo, d1_hi, 7));
  vst1q_u8(dst + 5 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 5));
  vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 5));
  vst1q_u8(dst + 6 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 3));
  vst1q_u8(dst + 6 * stride + 16, vextq_u8(d1_lo, d1_hi, 3));
  vst1q_u8(dst + 7 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 1));
  vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 1));
  vst1q_u8(dst + 8 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15));
  vst1q_u8(dst + 8 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 15));
  vst1q_u8(dst + 9 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13));
  vst1q_u8(dst + 9 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 13));
  vst1q_u8(dst + 10 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11));
  vst1q_u8(dst + 10 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 11));
  vst1q_u8(dst + 11 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9));
  vst1q_u8(dst + 11 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 9));
  vst1q_u8(dst + 12 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7));
  vst1q_u8(dst + 12 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 7));
  vst1q_u8(dst + 13 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5));
  vst1q_u8(dst + 13 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 5));
  vst1q_u8(dst + 14 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3));
  vst1q_u8(dst + 14 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 3));
  vst1q_u8(dst + 15 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1));
  vst1q_u8(dst + 15 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 1));
  vst1q_u8(dst + 16 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15));
  vst1q_u8(dst + 16 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15));
  vst1q_u8(dst + 17 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13));
  vst1q_u8(dst + 17 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13));
  vst1q_u8(dst + 18 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11));
  vst1q_u8(dst + 18 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11));
  vst1q_u8(dst + 19 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9));
  vst1q_u8(dst + 19 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9));
  vst1q_u8(dst + 20 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7));
  vst1q_u8(dst + 20 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7));
  vst1q_u8(dst + 21 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5));
  vst1q_u8(dst + 21 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5));
  vst1q_u8(dst + 22 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3));
  vst1q_u8(dst + 22 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3));
  vst1q_u8(dst + 23 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1));
  vst1q_u8(dst + 23 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1));
  vst1q_u8(dst + 24 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 15));
  vst1q_u8(dst + 24 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15));
  vst1q_u8(dst + 25 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 13));
  vst1q_u8(dst + 25 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13));
  vst1q_u8(dst + 26 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 11));
  vst1q_u8(dst + 26 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11));
  vst1q_u8(dst + 27 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 9));
  vst1q_u8(dst + 27 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9));
  vst1q_u8(dst + 28 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 7));
  vst1q_u8(dst + 28 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7));
  vst1q_u8(dst + 29 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 5));
  vst1q_u8(dst + 29 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5));
  vst1q_u8(dst + 30 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 3));
  vst1q_u8(dst + 30 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3));
  vst1q_u8(dst + 31 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 1));
  vst1q_u8(dst + 31 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1));
}
+
+// -----------------------------------------------------------------------------
+
void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  // D207 prediction uses only the left column:
  //   c0[i] = AVG2(left[i], left[i+1])
  //   c1[i] = AVG3(left[i], left[i+1], left[i+2])
  // with left[] extended by repeating left[3]. Row i is the interleaved
  // (c0, c1) stream shifted by 2*i elements.
  uint8x8_t l0, l3, l1, l2, c0, c1, c01, d0, d1;
  (void)above;

  // We need the low half lanes here for the c0/c1 arithmetic but the high half
  // lanes for the ext:
  // [ left[0], left[1], left[2], left[3], left[0], left[1], left[2], left[3] ]
  l0 = load_replicate_u8_4x1(left + 0);
  l3 = vld1_dup_u8(left + 3);

  // [ left[1], left[2], left[3], left[3], x, x, x, x ]
  l1 = vext_u8(l0, l3, 5);
  // [ left[2], left[3], left[3], left[3], x, x, x, x ]
  l2 = vext_u8(l0, l3, 6);

  c0 = vrhadd_u8(l0, l1);
  c1 = vrhadd_u8(vhadd_u8(l0, l2), l1);

  // [ c0[0], c1[0], c0[1], c1[1], c0[2], c1[2], c0[3], c1[3] ]
  c01 = vzip_u8(c0, c1).val[0];

  d0 = c01;
  d1 = vext_u8(c01, l3, 2);

  // Store the high half of the vector for stride={2,3} to avoid needing
  // additional ext instructions:
  // stride=0 [ c0[0], c1[0], c0[1], c1[1] ]
  // stride=1 [ c0[1], c1[1], c0[2], c1[2] ]
  // stride=2 [ c0[2], c1[2], c0[3], c1[3] ]
  // stride=3 [ c0[3], c1[3], left[3], left[3] ]
  store_u8_4x1(dst + 0 * stride, d0);
  store_u8_4x1(dst + 1 * stride, d1);
  store_u8_4x1_high(dst + 2 * stride, d0);
  store_u8_4x1_high(dst + 3 * stride, d1);
}
+
void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  // D207 prediction uses only the left column:
  //   c0[i] = AVG2(left[i], left[i+1])
  //   c1[i] = AVG3(left[i], left[i+1], left[i+2])
  // with left[] extended by repeating left[7]. Row i is the interleaved
  // (c0, c1) stream shifted by 2*i elements.
  uint8x8_t l7, l0, l1, l2, c0, c1, c01_lo, c01_hi;
  (void)above;

  l0 = vld1_u8(left + 0);
  l7 = vld1_dup_u8(left + 7);

  // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ]
  l1 = vext_u8(l0, l7, 1);
  // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ]
  l2 = vext_u8(l0, l7, 2);

  c0 = vrhadd_u8(l0, l1);
  c1 = vrhadd_u8(vhadd_u8(l0, l2), l1);

  // Interleave to [ c0[0], c1[0], c0[1], c1[1], ... ] across two registers.
  c01_lo = vzip_u8(c0, c1).val[0];
  c01_hi = vzip_u8(c0, c1).val[1];

  vst1_u8(dst + 0 * stride, c01_lo);
  vst1_u8(dst + 1 * stride, vext_u8(c01_lo, c01_hi, 2));
  vst1_u8(dst + 2 * stride, vext_u8(c01_lo, c01_hi, 4));
  vst1_u8(dst + 3 * stride, vext_u8(c01_lo, c01_hi, 6));
  vst1_u8(dst + 4 * stride, c01_hi);
  vst1_u8(dst + 5 * stride, vext_u8(c01_hi, l7, 2));
  vst1_u8(dst + 6 * stride, vext_u8(c01_hi, l7, 4));
  vst1_u8(dst + 7 * stride, vext_u8(c01_hi, l7, 6));
}
+
void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  // See vpx_d207_predictor_8x8_neon for details; same scheme with 16-lane
  // vectors and left[] extended by repeating left[15].
  uint8x16_t l15, l0, l1, l2, c0, c1, c01_lo, c01_hi;
  (void)above;

  l0 = vld1q_u8(left + 0);
  l15 = vld1q_dup_u8(left + 15);

  l1 = vextq_u8(l0, l15, 1);
  l2 = vextq_u8(l0, l15, 2);

  c0 = vrhaddq_u8(l0, l1);
  c1 = vrhaddq_u8(vhaddq_u8(l0, l2), l1);

  c01_lo = vzipq_u8(c0, c1).val[0];
  c01_hi = vzipq_u8(c0, c1).val[1];

  vst1q_u8(dst + 0 * stride, c01_lo);
  vst1q_u8(dst + 1 * stride, vextq_u8(c01_lo, c01_hi, 2));
  vst1q_u8(dst + 2 * stride, vextq_u8(c01_lo, c01_hi, 4));
  vst1q_u8(dst + 3 * stride, vextq_u8(c01_lo, c01_hi, 6));
  vst1q_u8(dst + 4 * stride, vextq_u8(c01_lo, c01_hi, 8));
  vst1q_u8(dst + 5 * stride, vextq_u8(c01_lo, c01_hi, 10));
  vst1q_u8(dst + 6 * stride, vextq_u8(c01_lo, c01_hi, 12));
  vst1q_u8(dst + 7 * stride, vextq_u8(c01_lo, c01_hi, 14));
  vst1q_u8(dst + 8 * stride, c01_hi);
  vst1q_u8(dst + 9 * stride, vextq_u8(c01_hi, l15, 2));
  vst1q_u8(dst + 10 * stride, vextq_u8(c01_hi, l15, 4));
  vst1q_u8(dst + 11 * stride, vextq_u8(c01_hi, l15, 6));
  vst1q_u8(dst + 12 * stride, vextq_u8(c01_hi, l15, 8));
  vst1q_u8(dst + 13 * stride, vextq_u8(c01_hi, l15, 10));
  vst1q_u8(dst + 14 * stride, vextq_u8(c01_hi, l15, 12));
  vst1q_u8(dst + 15 * stride, vextq_u8(c01_hi, l15, 14));
}
+
void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  // See vpx_d207_predictor_8x8_neon for details. The 32 left pixels are
  // processed as two 16-lane halves, giving four interleaved (c0, c1)
  // vectors c01[0..3]; the right halves of rows 24..31 are entirely the
  // replicated left[31].
  uint8x16_t l0_lo, l0_hi, l1_lo, l1_hi, l2_lo, l2_hi, l31, c0_lo, c0_hi, c1_lo,
      c1_hi, c01[4];
  (void)above;

  l0_lo = vld1q_u8(left + 0);
  l0_hi = vld1q_u8(left + 16);
  l31 = vld1q_dup_u8(left + 31);

  l1_lo = vextq_u8(l0_lo, l0_hi, 1);
  l1_hi = vextq_u8(l0_hi, l31, 1);
  l2_lo = vextq_u8(l0_lo, l0_hi, 2);
  l2_hi = vextq_u8(l0_hi, l31, 2);

  c0_lo = vrhaddq_u8(l0_lo, l1_lo);
  c0_hi = vrhaddq_u8(l0_hi, l1_hi);
  c1_lo = vrhaddq_u8(vhaddq_u8(l0_lo, l2_lo), l1_lo);
  c1_hi = vrhaddq_u8(vhaddq_u8(l0_hi, l2_hi), l1_hi);

  c01[0] = vzipq_u8(c0_lo, c1_lo).val[0];
  c01[1] = vzipq_u8(c0_lo, c1_lo).val[1];
  c01[2] = vzipq_u8(c0_hi, c1_hi).val[0];
  c01[3] = vzipq_u8(c0_hi, c1_hi).val[1];

  vst1q_u8(dst + 0 * stride + 0, c01[0]);
  vst1q_u8(dst + 0 * stride + 16, c01[1]);
  vst1q_u8(dst + 1 * stride + 0, vextq_u8(c01[0], c01[1], 2));
  vst1q_u8(dst + 1 * stride + 16, vextq_u8(c01[1], c01[2], 2));
  vst1q_u8(dst + 2 * stride + 0, vextq_u8(c01[0], c01[1], 4));
  vst1q_u8(dst + 2 * stride + 16, vextq_u8(c01[1], c01[2], 4));
  vst1q_u8(dst + 3 * stride + 0, vextq_u8(c01[0], c01[1], 6));
  vst1q_u8(dst + 3 * stride + 16, vextq_u8(c01[1], c01[2], 6));
  vst1q_u8(dst + 4 * stride + 0, vextq_u8(c01[0], c01[1], 8));
  vst1q_u8(dst + 4 * stride + 16, vextq_u8(c01[1], c01[2], 8));
  vst1q_u8(dst + 5 * stride + 0, vextq_u8(c01[0], c01[1], 10));
  vst1q_u8(dst + 5 * stride + 16, vextq_u8(c01[1], c01[2], 10));
  vst1q_u8(dst + 6 * stride + 0, vextq_u8(c01[0], c01[1], 12));
  vst1q_u8(dst + 6 * stride + 16, vextq_u8(c01[1], c01[2], 12));
  vst1q_u8(dst + 7 * stride + 0, vextq_u8(c01[0], c01[1], 14));
  vst1q_u8(dst + 7 * stride + 16, vextq_u8(c01[1], c01[2], 14));
  vst1q_u8(dst + 8 * stride + 0, c01[1]);
  vst1q_u8(dst + 8 * stride + 16, c01[2]);
  vst1q_u8(dst + 9 * stride + 0, vextq_u8(c01[1], c01[2], 2));
  vst1q_u8(dst + 9 * stride + 16, vextq_u8(c01[2], c01[3], 2));
  vst1q_u8(dst + 10 * stride + 0, vextq_u8(c01[1], c01[2], 4));
  vst1q_u8(dst + 10 * stride + 16, vextq_u8(c01[2], c01[3], 4));
  vst1q_u8(dst + 11 * stride + 0, vextq_u8(c01[1], c01[2], 6));
  vst1q_u8(dst + 11 * stride + 16, vextq_u8(c01[2], c01[3], 6));
  vst1q_u8(dst + 12 * stride + 0, vextq_u8(c01[1], c01[2], 8));
  vst1q_u8(dst + 12 * stride + 16, vextq_u8(c01[2], c01[3], 8));
  vst1q_u8(dst + 13 * stride + 0, vextq_u8(c01[1], c01[2], 10));
  vst1q_u8(dst + 13 * stride + 16, vextq_u8(c01[2], c01[3], 10));
  vst1q_u8(dst + 14 * stride + 0, vextq_u8(c01[1], c01[2], 12));
  vst1q_u8(dst + 14 * stride + 16, vextq_u8(c01[2], c01[3], 12));
  vst1q_u8(dst + 15 * stride + 0, vextq_u8(c01[1], c01[2], 14));
  vst1q_u8(dst + 15 * stride + 16, vextq_u8(c01[2], c01[3], 14));
  vst1q_u8(dst + 16 * stride + 0, c01[2]);
  vst1q_u8(dst + 16 * stride + 16, c01[3]);
  vst1q_u8(dst + 17 * stride + 0, vextq_u8(c01[2], c01[3], 2));
  vst1q_u8(dst + 17 * stride + 16, vextq_u8(c01[3], l31, 2));
  vst1q_u8(dst + 18 * stride + 0, vextq_u8(c01[2], c01[3], 4));
  vst1q_u8(dst + 18 * stride + 16, vextq_u8(c01[3], l31, 4));
  vst1q_u8(dst + 19 * stride + 0, vextq_u8(c01[2], c01[3], 6));
  vst1q_u8(dst + 19 * stride + 16, vextq_u8(c01[3], l31, 6));
  vst1q_u8(dst + 20 * stride + 0, vextq_u8(c01[2], c01[3], 8));
  vst1q_u8(dst + 20 * stride + 16, vextq_u8(c01[3], l31, 8));
  vst1q_u8(dst + 21 * stride + 0, vextq_u8(c01[2], c01[3], 10));
  vst1q_u8(dst + 21 * stride + 16, vextq_u8(c01[3], l31, 10));
  vst1q_u8(dst + 22 * stride + 0, vextq_u8(c01[2], c01[3], 12));
  vst1q_u8(dst + 22 * stride + 16, vextq_u8(c01[3], l31, 12));
  vst1q_u8(dst + 23 * stride + 0, vextq_u8(c01[2], c01[3], 14));
  vst1q_u8(dst + 23 * stride + 16, vextq_u8(c01[3], l31, 14));
  vst1q_u8(dst + 24 * stride + 0, c01[3]);
  vst1q_u8(dst + 24 * stride + 16, l31);
  vst1q_u8(dst + 25 * stride + 0, vextq_u8(c01[3], l31, 2));
  vst1q_u8(dst + 25 * stride + 16, l31);
  vst1q_u8(dst + 26 * stride + 0, vextq_u8(c01[3], l31, 4));
  vst1q_u8(dst + 26 * stride + 16, l31);
  vst1q_u8(dst + 27 * stride + 0, vextq_u8(c01[3], l31, 6));
  vst1q_u8(dst + 27 * stride + 16, l31);
  vst1q_u8(dst + 28 * stride + 0, vextq_u8(c01[3], l31, 8));
  vst1q_u8(dst + 28 * stride + 16, l31);
  vst1q_u8(dst + 29 * stride + 0, vextq_u8(c01[3], l31, 10));
  vst1q_u8(dst + 29 * stride + 16, l31);
  vst1q_u8(dst + 30 * stride + 0, vextq_u8(c01[3], l31, 12));
  vst1q_u8(dst + 30 * stride + 16, l31);
  vst1q_u8(dst + 31 * stride + 0, vextq_u8(c01[3], l31, 14));
  vst1q_u8(dst + 31 * stride + 16, l31);
}
+
+// -----------------------------------------------------------------------------
+
+#if !HAVE_NEON_ASM
+
void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  // Vertical prediction: copy the 4 bytes above the block into each of the
  // 4 output rows.
  //
  // The previous implementation loaded/stored through casted uint32_t
  // pointers (*(const uint32_t *)above). That is a strict-aliasing violation
  // and assumes 4-byte alignment of `above`/`dst`, neither of which the API
  // guarantees; plain byte copies are well-defined and the compiler merges
  // them into a single 32-bit access when alignment permits.
  int r, c;
  (void)left;

  for (r = 0; r < 4; r++, dst += stride) {
    for (c = 0; c < 4; c++) {
      dst[c] = above[c];
    }
  }
}
+
void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  // Vertical prediction: replicate the 8 bytes above the block into all
  // 8 output rows.
  const uint8x8_t row = vld1_u8(above);
  int rows_left = 8;
  (void)left;

  while (rows_left-- > 0) {
    vst1_u8(dst, row);
    dst += stride;
  }
}
+
void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  // Vertical prediction: replicate the 16 bytes above the block into all
  // 16 output rows.
  const uint8x16_t row = vld1q_u8(above);
  int remaining;
  (void)left;

  for (remaining = 16; remaining > 0; --remaining) {
    vst1q_u8(dst, row);
    dst += stride;
  }
}
+
void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  // Vertical prediction: copy the 32 bytes above the block into each of the
  // 32 output rows, one 16-byte store per half-row.
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  int i;
  (void)left;

  for (i = 0; i < 32; i++) {
    // Note: performance was worse using vst2q_u8 under gcc-4.9 & clang-3.8.
    // clang-3.8 unrolled the loop fully with no filler so the cause is likely
    // the latency of the instruction.
    vst1q_u8(dst, d0);
    dst += 16;
    vst1q_u8(dst, d1);
    dst += stride - 16;
  }
}
+
+// -----------------------------------------------------------------------------
+
void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  // Horizontal prediction: row i of the 4x4 block is filled with left[i].
  const uint32x2_t zero = vdup_n_u32(0);
  // NOTE(review): the uint32_t* casts below assume `left` and `dst` are
  // 4-byte aligned -- TODO confirm this holds for every caller.
  const uint8x8_t left_u8 =
      vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)left, zero, 0));
  uint8x8_t d;
  (void)above;

  d = vdup_lane_u8(left_u8, 0);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
  dst += stride;
  d = vdup_lane_u8(left_u8, 1);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
  dst += stride;
  d = vdup_lane_u8(left_u8, 2);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
  dst += stride;
  d = vdup_lane_u8(left_u8, 3);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
}
+
void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  // Horizontal prediction: row i of the 8x8 block is filled with left[i].
  // Fully unrolled because vdup_lane_u8 requires a compile-time lane index.
  const uint8x8_t left_u8 = vld1_u8(left);
  uint8x8_t d;
  (void)above;

  d = vdup_lane_u8(left_u8, 0);
  vst1_u8(dst, d);
  dst += stride;
  d = vdup_lane_u8(left_u8, 1);
  vst1_u8(dst, d);
  dst += stride;
  d = vdup_lane_u8(left_u8, 2);
  vst1_u8(dst, d);
  dst += stride;
  d = vdup_lane_u8(left_u8, 3);
  vst1_u8(dst, d);
  dst += stride;
  d = vdup_lane_u8(left_u8, 4);
  vst1_u8(dst, d);
  dst += stride;
  d = vdup_lane_u8(left_u8, 5);
  vst1_u8(dst, d);
  dst += stride;
  d = vdup_lane_u8(left_u8, 6);
  vst1_u8(dst, d);
  dst += stride;
  d = vdup_lane_u8(left_u8, 7);
  vst1_u8(dst, d);
}
+
// Broadcast each of the 8 `left` bytes across a 16-byte row and store 8
// consecutive rows, advancing *dst by 8 rows.
static INLINE void h_store_16x8(uint8_t **dst, const ptrdiff_t stride,
                                const uint8x8_t left) {
  const uint8x16_t row_0 = vdupq_lane_u8(left, 0);
  const uint8x16_t row_1 = vdupq_lane_u8(left, 1);
  const uint8x16_t row_2 = vdupq_lane_u8(left, 2);
  const uint8x16_t row_3 = vdupq_lane_u8(left, 3);
  const uint8x16_t row_4 = vdupq_lane_u8(left, 4);
  const uint8x16_t row_5 = vdupq_lane_u8(left, 5);
  const uint8x16_t row_6 = vdupq_lane_u8(left, 6);
  const uint8x16_t row_7 = vdupq_lane_u8(left, 7);

  vst1q_u8(*dst, row_0);
  *dst += stride;
  vst1q_u8(*dst, row_1);
  *dst += stride;
  vst1q_u8(*dst, row_2);
  *dst += stride;
  vst1q_u8(*dst, row_3);
  *dst += stride;
  vst1q_u8(*dst, row_4);
  *dst += stride;
  vst1q_u8(*dst, row_5);
  *dst += stride;
  vst1q_u8(*dst, row_6);
  *dst += stride;
  vst1q_u8(*dst, row_7);
  *dst += stride;
}
+
void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  // Horizontal prediction: row i is filled with left[i]; the 16 rows are
  // handled as two batches of 8 via h_store_16x8.
  const uint8x16_t left_u8q = vld1q_u8(left);
  (void)above;

  h_store_16x8(&dst, stride, vget_low_u8(left_u8q));
  h_store_16x8(&dst, stride, vget_high_u8(left_u8q));
}
+
// Broadcast each of the 8 `left` bytes across a full 32-byte row (stored as
// two 16-byte halves) and store 8 consecutive rows, advancing *dst by 8 rows.
static INLINE void h_store_32x8(uint8_t **dst, const ptrdiff_t stride,
                                const uint8x8_t left) {
  const uint8x16_t row_0 = vdupq_lane_u8(left, 0);
  const uint8x16_t row_1 = vdupq_lane_u8(left, 1);
  const uint8x16_t row_2 = vdupq_lane_u8(left, 2);
  const uint8x16_t row_3 = vdupq_lane_u8(left, 3);
  const uint8x16_t row_4 = vdupq_lane_u8(left, 4);
  const uint8x16_t row_5 = vdupq_lane_u8(left, 5);
  const uint8x16_t row_6 = vdupq_lane_u8(left, 6);
  const uint8x16_t row_7 = vdupq_lane_u8(left, 7);

  vst1q_u8(*dst, row_0);  // Note clang-3.8 produced poor code w/vst2q_u8
  *dst += 16;
  vst1q_u8(*dst, row_0);
  *dst += stride - 16;
  vst1q_u8(*dst, row_1);
  *dst += 16;
  vst1q_u8(*dst, row_1);
  *dst += stride - 16;
  vst1q_u8(*dst, row_2);
  *dst += 16;
  vst1q_u8(*dst, row_2);
  *dst += stride - 16;
  vst1q_u8(*dst, row_3);
  *dst += 16;
  vst1q_u8(*dst, row_3);
  *dst += stride - 16;
  vst1q_u8(*dst, row_4);
  *dst += 16;
  vst1q_u8(*dst, row_4);
  *dst += stride - 16;
  vst1q_u8(*dst, row_5);
  *dst += 16;
  vst1q_u8(*dst, row_5);
  *dst += stride - 16;
  vst1q_u8(*dst, row_6);
  *dst += 16;
  vst1q_u8(*dst, row_6);
  *dst += stride - 16;
  vst1q_u8(*dst, row_7);
  *dst += 16;
  vst1q_u8(*dst, row_7);
  *dst += stride - 16;
}
+
void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  // Horizontal prediction: row i is filled with left[i]. Load the 32 left
  // pixels as two 16-byte vectors and emit four batches of 8 rows.
  const uint8x16_t left_lo = vld1q_u8(left);
  const uint8x16_t left_hi = vld1q_u8(left + 16);
  (void)above;

  h_store_32x8(&dst, stride, vget_low_u8(left_lo));
  h_store_32x8(&dst, stride, vget_high_u8(left_lo));
  h_store_32x8(&dst, stride, vget_low_u8(left_hi));
  h_store_32x8(&dst, stride, vget_high_u8(left_hi));
}
+
+// -----------------------------------------------------------------------------
+
// Widen 8 unsigned bytes to signed 16-bit lanes (values remain 0..255).
static INLINE int16x8_t convert_u8_to_s16(uint8x8_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(v));
}
+
void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  // TM (true-motion) prediction: each pixel is
  // clip8(left[row] + above[col] - above[-1]). `sub` holds the widened
  // (above - top_left) terms and vqmovun_s16 performs the 0..255 clamp.
  const uint8x8_t top_left = vld1_dup_u8(above - 1);
  const uint8x8_t left_u8 = vld1_u8(left);
  const uint8x8_t above_u8 = vld1_u8(above);
  const int16x4_t left_s16 = vget_low_s16(convert_u8_to_s16(left_u8));
  int16x8_t sub, sum;
  uint32x2_t d;

  sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left));
  // Avoid vcombine_s16() which generates lots of redundant code with clang-3.8.
  // Duplicate the low 4 lanes of sub into both halves so each half of `sum`
  // below produces one 4-wide output row.
  sub = vreinterpretq_s16_s64(
      vdupq_lane_s64(vreinterpret_s64_s16(vget_low_s16(sub)), 0));

  // Rows 0 and 1 packed into one q register.
  sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1));
  sum = vaddq_s16(sum, sub);
  d = vreinterpret_u32_u8(vqmovun_s16(sum));
  vst1_lane_u32((uint32_t *)dst, d, 0);
  dst += stride;
  vst1_lane_u32((uint32_t *)dst, d, 1);
  dst += stride;

  // Rows 2 and 3.
  sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3));
  sum = vaddq_s16(sum, sub);
  d = vreinterpret_u32_u8(vqmovun_s16(sum));
  vst1_lane_u32((uint32_t *)dst, d, 0);
  dst += stride;
  vst1_lane_u32((uint32_t *)dst, d, 1);
}
+
// Emit one 8-wide TM row: saturate-narrow (left + (above - top_left)) to u8
// and advance *dst to the next row.
static INLINE void tm_8_kernel(uint8_t **dst, const ptrdiff_t stride,
                               const int16x8_t left_dup, const int16x8_t sub) {
  const int16x8_t sum = vaddq_s16(left_dup, sub);
  const uint8x8_t d = vqmovun_s16(sum);
  vst1_u8(*dst, d);
  *dst += stride;
}
+
void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  // TM prediction: pixel = clip8(left[row] + above[col] - above[-1]).
  const uint8x8_t top_left = vld1_dup_u8(above - 1);
  const uint8x8_t above_u8 = vld1_u8(above);
  const uint8x8_t left_u8 = vld1_u8(left);
  const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
  const int16x8_t sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left));
  int16x4_t left_s16d = vget_low_s16(left_s16q);
  int i;

  // First iteration handles rows 0-3 (left[0..3]), second rows 4-7.
  for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
    int16x8_t left_dup;

    left_dup = vdupq_lane_s16(left_s16d, 0);
    tm_8_kernel(&dst, stride, left_dup, sub);
    left_dup = vdupq_lane_s16(left_s16d, 1);
    tm_8_kernel(&dst, stride, left_dup, sub);
    left_dup = vdupq_lane_s16(left_s16d, 2);
    tm_8_kernel(&dst, stride, left_dup, sub);
    left_dup = vdupq_lane_s16(left_s16d, 3);
    tm_8_kernel(&dst, stride, left_dup, sub);
  }
}
+
// Emit one 16-wide TM row as two saturated 8-wide halves, advancing *dst to
// the next row.
static INLINE void tm_16_kernel(uint8_t **dst, const ptrdiff_t stride,
                                const int16x8_t left_dup, const int16x8_t sub0,
                                const int16x8_t sub1) {
  const int16x8_t sum0 = vaddq_s16(left_dup, sub0);
  const int16x8_t sum1 = vaddq_s16(left_dup, sub1);
  const uint8x8_t d0 = vqmovun_s16(sum0);
  const uint8x8_t d1 = vqmovun_s16(sum1);
  vst1_u8(*dst, d0);
  *dst += 8;
  vst1_u8(*dst, d1);
  *dst += stride - 8;
}
+
void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  // TM prediction: pixel = clip8(left[row] + above[col] - above[-1]).
  // sub0/sub1 hold the widened (above - top_left) terms for the two 8-wide
  // halves of each row.
  const uint8x16_t top_left = vld1q_dup_u8(above - 1);
  const uint8x16_t above_u8 = vld1q_u8(above);
  const int16x8_t sub0 = vreinterpretq_s16_u16(
      vsubl_u8(vget_low_u8(above_u8), vget_low_u8(top_left)));
  const int16x8_t sub1 = vreinterpretq_s16_u16(
      vsubl_u8(vget_high_u8(above_u8), vget_high_u8(top_left)));
  int16x8_t left_dup;
  int i;

  // Process the 16 rows as two batches of 8 left-column pixels.
  for (i = 0; i < 2; i++, left += 8) {
    const uint8x8_t left_u8 = vld1_u8(left);
    const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
    const int16x4_t left_low = vget_low_s16(left_s16q);
    const int16x4_t left_high = vget_high_s16(left_s16q);

    left_dup = vdupq_lane_s16(left_low, 0);
    tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
    left_dup = vdupq_lane_s16(left_low, 1);
    tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
    left_dup = vdupq_lane_s16(left_low, 2);
    tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
    left_dup = vdupq_lane_s16(left_low, 3);
    tm_16_kernel(&dst, stride, left_dup, sub0, sub1);

    left_dup = vdupq_lane_s16(left_high, 0);
    tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
    left_dup = vdupq_lane_s16(left_high, 1);
    tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
    left_dup = vdupq_lane_s16(left_high, 2);
    tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
    left_dup = vdupq_lane_s16(left_high, 3);
    tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
  }
}
+
// Emit one 32-wide TM row: four saturated 8-wide sums stored as two 16-byte
// halves, advancing *dst to the next row.
static INLINE void tm_32_kernel(uint8_t **dst, const ptrdiff_t stride,
                                const int16x8_t left_dup, const int16x8_t sub0,
                                const int16x8_t sub1, const int16x8_t sub2,
                                const int16x8_t sub3) {
  const int16x8_t sum0 = vaddq_s16(left_dup, sub0);
  const int16x8_t sum1 = vaddq_s16(left_dup, sub1);
  const int16x8_t sum2 = vaddq_s16(left_dup, sub2);
  const int16x8_t sum3 = vaddq_s16(left_dup, sub3);
  const uint8x8_t d0 = vqmovun_s16(sum0);
  const uint8x8_t d1 = vqmovun_s16(sum1);
  const uint8x8_t d2 = vqmovun_s16(sum2);
  const uint8x8_t d3 = vqmovun_s16(sum3);

  vst1q_u8(*dst, vcombine_u8(d0, d1));
  *dst += 16;
  vst1q_u8(*dst, vcombine_u8(d2, d3));
  *dst += stride - 16;
}
+
void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  // TM prediction: pixel = clip8(left[row] + above[col] - above[-1]).
  // sub0..sub3 hold the widened (above - top_left) terms for the four 8-wide
  // quarters of each 32-wide row.
  const uint8x16_t top_left = vld1q_dup_u8(above - 1);
  const uint8x16_t above_low = vld1q_u8(above);
  const uint8x16_t above_high = vld1q_u8(above + 16);
  const int16x8_t sub0 = vreinterpretq_s16_u16(
      vsubl_u8(vget_low_u8(above_low), vget_low_u8(top_left)));
  const int16x8_t sub1 = vreinterpretq_s16_u16(
      vsubl_u8(vget_high_u8(above_low), vget_high_u8(top_left)));
  const int16x8_t sub2 = vreinterpretq_s16_u16(
      vsubl_u8(vget_low_u8(above_high), vget_low_u8(top_left)));
  const int16x8_t sub3 = vreinterpretq_s16_u16(
      vsubl_u8(vget_high_u8(above_high), vget_high_u8(top_left)));
  int16x8_t left_dup;
  int i, j;

  // Process the 32 rows as four batches of 8 left-column pixels, each batch
  // split into its low and high 4-lane halves.
  for (j = 0; j < 4; j++, left += 8) {
    const uint8x8_t left_u8 = vld1_u8(left);
    const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
    int16x4_t left_s16d = vget_low_s16(left_s16q);
    for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
      left_dup = vdupq_lane_s16(left_s16d, 0);
      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
      left_dup = vdupq_lane_s16(left_s16d, 1);
      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
      left_dup = vdupq_lane_s16(left_s16d, 2);
      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
      left_dup = vdupq_lane_s16(left_s16d, 3);
      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
    }
  }
}
+#endif // !HAVE_NEON_ASM
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm
new file mode 100644
index 0000000000..115790d480
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm
@@ -0,0 +1,630 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_v_predictor_4x4_neon|
+ EXPORT |vpx_v_predictor_8x8_neon|
+ EXPORT |vpx_v_predictor_16x16_neon|
+ EXPORT |vpx_v_predictor_32x32_neon|
+ EXPORT |vpx_h_predictor_4x4_neon|
+ EXPORT |vpx_h_predictor_8x8_neon|
+ EXPORT |vpx_h_predictor_16x16_neon|
+ EXPORT |vpx_h_predictor_32x32_neon|
+ EXPORT |vpx_tm_predictor_4x4_neon|
+ EXPORT |vpx_tm_predictor_8x8_neon|
+ EXPORT |vpx_tm_predictor_16x16_neon|
+ EXPORT |vpx_tm_predictor_32x32_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_v_predictor_4x4_neon| PROC
+ vld1.32 {d0[0]}, [r2]
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ bx lr
+ ENDP ; |vpx_v_predictor_4x4_neon|
+
+;void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_v_predictor_8x8_neon| PROC
+ vld1.8 {d0}, [r2]
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ bx lr
+ ENDP ; |vpx_v_predictor_8x8_neon|
+
+;void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_v_predictor_16x16_neon| PROC
+ vld1.8 {q0}, [r2]
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ bx lr
+ ENDP ; |vpx_v_predictor_16x16_neon|
+
+;void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_v_predictor_32x32_neon| PROC
+ vld1.8 {q0, q1}, [r2]
+ mov r2, #2
+loop_v
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ subs r2, r2, #1
+ bgt loop_v
+ bx lr
+ ENDP ; |vpx_v_predictor_32x32_neon|
+
+;void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_h_predictor_4x4_neon| PROC
+ vld1.32 {d1[0]}, [r3]
+ vdup.8 d0, d1[0]
+ vst1.32 {d0[0]}, [r0], r1
+ vdup.8 d0, d1[1]
+ vst1.32 {d0[0]}, [r0], r1
+ vdup.8 d0, d1[2]
+ vst1.32 {d0[0]}, [r0], r1
+ vdup.8 d0, d1[3]
+ vst1.32 {d0[0]}, [r0], r1
+ bx lr
+ ENDP ; |vpx_h_predictor_4x4_neon|
+
+;void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_h_predictor_8x8_neon| PROC
+ vld1.64 {d1}, [r3]
+ vdup.8 d0, d1[0]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[1]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[2]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[3]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[4]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[5]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[6]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[7]
+ vst1.64 {d0}, [r0], r1
+ bx lr
+ ENDP ; |vpx_h_predictor_8x8_neon|
+
+;void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_h_predictor_16x16_neon| PROC
+ vld1.8 {q1}, [r3]
+ vdup.8 q0, d2[0]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[1]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[2]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[3]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[4]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[5]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[6]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[7]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[0]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[1]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[2]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[3]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[4]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[5]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[6]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[7]
+ vst1.8 {q0}, [r0], r1
+ bx lr
+ ENDP ; |vpx_h_predictor_16x16_neon|
+
+;void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_h_predictor_32x32_neon| PROC
+ sub r1, r1, #16
+ mov r2, #2
+loop_h
+ vld1.8 {q1}, [r3]!
+ vdup.8 q0, d2[0]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[1]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[2]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[3]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[4]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[5]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[6]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[7]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[0]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[1]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[2]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[3]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[4]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[5]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[6]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[7]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ subs r2, r2, #1
+ bgt loop_h
+ bx lr
+ ENDP ; |vpx_h_predictor_32x32_neon|
+
+;void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_tm_predictor_4x4_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ vld1.u8 {d0[]}, [r12]
+
+ ; Load above 4 pixels
+ vld1.32 {d2[0]}, [r2]
+
+ ; Compute above - ytop_left
+ vsubl.u8 q3, d2, d0
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; 1st row and 2nd row
+ vld1.u8 {d2[]}, [r3]!
+ vld1.u8 {d4[]}, [r3]!
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vadd.s16 q1, q1, q3
+ vadd.s16 q2, q2, q3
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d1[0]}, [r0], r1
+
+ ; 3rd row and 4th row
+ vld1.u8 {d2[]}, [r3]!
+ vld1.u8 {d4[]}, [r3]
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vadd.s16 q1, q1, q3
+ vadd.s16 q2, q2, q3
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d1[0]}, [r0], r1
+ bx lr
+ ENDP ; |vpx_tm_predictor_4x4_neon|
+
+;void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_tm_predictor_8x8_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ vld1.8 {d0[]}, [r12]
+
+ ; preload 8 left
+ vld1.8 {d30}, [r3]
+
+ ; Load above 8 pixels
+ vld1.64 {d2}, [r2]
+
+ vmovl.u8 q10, d30
+
+ ; Compute above - ytop_left
+ vsubl.u8 q3, d2, d0
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; 1st row and 2nd row
+ vdup.16 q0, d20[0]
+ vdup.16 q1, d20[1]
+ vadd.s16 q0, q3, q0
+ vadd.s16 q1, q3, q1
+
+ ; 3rd row and 4th row
+ vdup.16 q8, d20[2]
+ vdup.16 q9, d20[3]
+ vadd.s16 q8, q3, q8
+ vadd.s16 q9, q3, q9
+
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q8
+ vqmovun.s16 d3, q9
+
+ vst1.64 {d0}, [r0], r1
+ vst1.64 {d1}, [r0], r1
+ vst1.64 {d2}, [r0], r1
+ vst1.64 {d3}, [r0], r1
+
+ ; 5th row and 6th row
+ vdup.16 q0, d21[0]
+ vdup.16 q1, d21[1]
+ vadd.s16 q0, q3, q0
+ vadd.s16 q1, q3, q1
+
+ ; 7th row and 8th row
+ vdup.16 q8, d21[2]
+ vdup.16 q9, d21[3]
+ vadd.s16 q8, q3, q8
+ vadd.s16 q9, q3, q9
+
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q8
+ vqmovun.s16 d3, q9
+
+ vst1.64 {d0}, [r0], r1
+ vst1.64 {d1}, [r0], r1
+ vst1.64 {d2}, [r0], r1
+ vst1.64 {d3}, [r0], r1
+
+ bx lr
+ ENDP ; |vpx_tm_predictor_8x8_neon|
+
+;void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_tm_predictor_16x16_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ vld1.8 {d0[]}, [r12]
+
+ ; Load above 8 pixels
+ vld1.8 {q1}, [r2]
+
+ ; preload 8 left into r12
+ vld1.8 {d18}, [r3]!
+
+ ; Compute above - ytop_left
+ vsubl.u8 q2, d2, d0
+ vsubl.u8 q3, d3, d0
+
+ vmovl.u8 q10, d18
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; Process 8 rows in each single loop and loop 2 times to process 16 rows.
+ mov r2, #2
+
+loop_16x16_neon
+ ; Process two rows.
+ vdup.16 q0, d20[0]
+ vdup.16 q8, d20[1]
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+ vdup.16 q0, d20[2] ; proload next 2 rows data
+ vdup.16 q8, d20[3]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+ vdup.16 q0, d21[0] ; proload next 2 rows data
+ vdup.16 q8, d21[1]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+ vdup.16 q0, d21[2] ; proload next 2 rows data
+ vdup.16 q8, d21[3]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+ vld1.8 {d18}, [r3]! ; preload 8 left into r12
+ vmovl.u8 q10, d18
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ subs r2, r2, #1
+ bgt loop_16x16_neon
+
+ bx lr
+ ENDP ; |vpx_tm_predictor_16x16_neon|
+
+;void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vpx_tm_predictor_32x32_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ vld1.8 {d0[]}, [r12]
+
+ ; Load above 32 pixels
+ vld1.8 {q1}, [r2]!
+ vld1.8 {q2}, [r2]
+
+ ; preload 8 left pixels
+ vld1.8 {d26}, [r3]!
+
+ ; Compute above - ytop_left
+ vsubl.u8 q8, d2, d0
+ vsubl.u8 q9, d3, d0
+ vsubl.u8 q10, d4, d0
+ vsubl.u8 q11, d5, d0
+
+ vmovl.u8 q3, d26
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; Process 8 rows in each single loop and loop 4 times to process 32 rows.
+ mov r2, #4
+
+loop_32x32_neon
+ ; Process two rows.
+ vdup.16 q0, d6[0]
+ vdup.16 q2, d6[1]
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q1, d6[2]
+ vdup.16 q2, d6[3]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q1, q8
+ vadd.s16 q13, q1, q9
+ vadd.s16 q14, q1, q10
+ vadd.s16 q15, q1, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q0, d7[0]
+ vdup.16 q2, d7[1]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q0, d7[2]
+ vdup.16 q2, d7[3]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vld1.8 {d0}, [r3]! ; preload 8 left pixels
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vmovl.u8 q3, d0
+ vst1.64 {d24-d27}, [r0], r1
+
+ subs r2, r2, #1
+ bgt loop_32x32_neon
+
+ bx lr
+ ENDP ; |vpx_tm_predictor_32x32_neon|
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm
new file mode 100644
index 0000000000..730c40de0e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm
@@ -0,0 +1,666 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_lpf_horizontal_16_neon|
+ EXPORT |vpx_lpf_horizontal_16_dual_neon|
+ EXPORT |vpx_lpf_vertical_16_neon|
+ EXPORT |vpx_lpf_vertical_16_dual_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; void mb_lpf_horizontal_edge(uint8_t *s, int p,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; r12 int count
+|mb_lpf_horizontal_edge| PROC
+ push {r4-r8, lr}
+ vpush {d8-d15}
+ ldr r4, [sp, #88] ; load thresh
+
+h_count
+ vld1.8 {d16[]}, [r2] ; load *blimit
+ vld1.8 {d17[]}, [r3] ; load *limit
+ vld1.8 {d18[]}, [r4] ; load *thresh
+
+ sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines
+
+ vld1.u8 {d0}, [r8@64], r1 ; p7
+ vld1.u8 {d1}, [r8@64], r1 ; p6
+ vld1.u8 {d2}, [r8@64], r1 ; p5
+ vld1.u8 {d3}, [r8@64], r1 ; p4
+ vld1.u8 {d4}, [r8@64], r1 ; p3
+ vld1.u8 {d5}, [r8@64], r1 ; p2
+ vld1.u8 {d6}, [r8@64], r1 ; p1
+ vld1.u8 {d7}, [r8@64], r1 ; p0
+ vld1.u8 {d8}, [r8@64], r1 ; q0
+ vld1.u8 {d9}, [r8@64], r1 ; q1
+ vld1.u8 {d10}, [r8@64], r1 ; q2
+ vld1.u8 {d11}, [r8@64], r1 ; q3
+ vld1.u8 {d12}, [r8@64], r1 ; q4
+ vld1.u8 {d13}, [r8@64], r1 ; q5
+ vld1.u8 {d14}, [r8@64], r1 ; q6
+ vld1.u8 {d15}, [r8@64], r1 ; q7
+
+ bl vpx_wide_mbfilter_neon
+
+ tst r7, #1
+ beq h_mbfilter
+
+ ; flat && mask were not set for any of the channels. Just store the values
+ ; from filter.
+ sub r8, r0, r1, lsl #1
+
+ vst1.u8 {d25}, [r8@64], r1 ; store op1
+ vst1.u8 {d24}, [r8@64], r1 ; store op0
+ vst1.u8 {d23}, [r8@64], r1 ; store oq0
+ vst1.u8 {d26}, [r8@64], r1 ; store oq1
+
+ b h_next
+
+h_mbfilter
+ tst r7, #2
+ beq h_wide_mbfilter
+
+ ; flat2 was not set for any of the channels. Just store the values from
+ ; mbfilter.
+ sub r8, r0, r1, lsl #1
+ sub r8, r8, r1
+
+ vst1.u8 {d18}, [r8@64], r1 ; store op2
+ vst1.u8 {d19}, [r8@64], r1 ; store op1
+ vst1.u8 {d20}, [r8@64], r1 ; store op0
+ vst1.u8 {d21}, [r8@64], r1 ; store oq0
+ vst1.u8 {d22}, [r8@64], r1 ; store oq1
+ vst1.u8 {d23}, [r8@64], r1 ; store oq2
+
+ b h_next
+
+h_wide_mbfilter
+ sub r8, r0, r1, lsl #3
+ add r8, r8, r1
+
+ vst1.u8 {d16}, [r8@64], r1 ; store op6
+ vst1.u8 {d24}, [r8@64], r1 ; store op5
+ vst1.u8 {d25}, [r8@64], r1 ; store op4
+ vst1.u8 {d26}, [r8@64], r1 ; store op3
+ vst1.u8 {d27}, [r8@64], r1 ; store op2
+ vst1.u8 {d18}, [r8@64], r1 ; store op1
+ vst1.u8 {d19}, [r8@64], r1 ; store op0
+ vst1.u8 {d20}, [r8@64], r1 ; store oq0
+ vst1.u8 {d21}, [r8@64], r1 ; store oq1
+ vst1.u8 {d22}, [r8@64], r1 ; store oq2
+ vst1.u8 {d23}, [r8@64], r1 ; store oq3
+ vst1.u8 {d1}, [r8@64], r1 ; store oq4
+ vst1.u8 {d2}, [r8@64], r1 ; store oq5
+ vst1.u8 {d3}, [r8@64], r1 ; store oq6
+
+h_next
+ add r0, r0, #8
+ subs r12, r12, #1
+ bne h_count
+
+ vpop {d8-d15}
+ pop {r4-r8, pc}
+
+ ENDP ; |mb_lpf_horizontal_edge|
+
+; void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int pitch,
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh
+|vpx_lpf_horizontal_16_neon| PROC
+ mov r12, #1
+ b mb_lpf_horizontal_edge
+ ENDP ; |vpx_lpf_horizontal_16_neon|
+
+; void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int pitch,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int pitch,
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh
+|vpx_lpf_horizontal_16_dual_neon| PROC
+ mov r12, #2
+ b mb_lpf_horizontal_edge
+ ENDP ; |vpx_lpf_horizontal_16_dual_neon|
+
+; void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+; const uint8_t *limit, const uint8_t *thresh,
+; int count) {
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; r12 int count
+|mb_lpf_vertical_edge_w| PROC
+ push {r4-r8, lr}
+ vpush {d8-d15}
+ ldr r4, [sp, #88] ; load thresh
+
+v_count
+ vld1.8 {d16[]}, [r2] ; load *blimit
+ vld1.8 {d17[]}, [r3] ; load *limit
+ vld1.8 {d18[]}, [r4] ; load *thresh
+
+ sub r8, r0, #8
+
+ vld1.8 {d0}, [r8@64], r1
+ vld1.8 {d8}, [r0@64], r1
+ vld1.8 {d1}, [r8@64], r1
+ vld1.8 {d9}, [r0@64], r1
+ vld1.8 {d2}, [r8@64], r1
+ vld1.8 {d10}, [r0@64], r1
+ vld1.8 {d3}, [r8@64], r1
+ vld1.8 {d11}, [r0@64], r1
+ vld1.8 {d4}, [r8@64], r1
+ vld1.8 {d12}, [r0@64], r1
+ vld1.8 {d5}, [r8@64], r1
+ vld1.8 {d13}, [r0@64], r1
+ vld1.8 {d6}, [r8@64], r1
+ vld1.8 {d14}, [r0@64], r1
+ vld1.8 {d7}, [r8@64], r1
+ vld1.8 {d15}, [r0@64], r1
+
+ sub r0, r0, r1, lsl #3
+
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ vtrn.8 d0, d1
+ vtrn.8 d2, d3
+ vtrn.8 d4, d5
+ vtrn.8 d6, d7
+
+ vtrn.8 d8, d9
+ vtrn.8 d10, d11
+ vtrn.8 d12, d13
+ vtrn.8 d14, d15
+
+ bl vpx_wide_mbfilter_neon
+
+ tst r7, #1
+ beq v_mbfilter
+
+ ; flat && mask were not set for any of the channels. Just store the values
+ ; from filter.
+ sub r0, #2
+
+ vswp d23, d25
+
+ vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r0], r1
+ vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r0], r1
+ vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r0], r1
+ vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r0], r1
+ vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r0], r1
+ vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r0], r1
+ vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r0], r1
+ vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r0], r1
+ add r0, #2
+
+ b v_next
+
+v_mbfilter
+ tst r7, #2
+ beq v_wide_mbfilter
+
+ ; flat2 was not set for any of the channels. Just store the values from
+ ; mbfilter.
+ sub r8, r0, #3
+
+ vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1
+ vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1
+ vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1
+ vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1
+ vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1
+ vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1
+ vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1
+ vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1
+ vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1
+ vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1
+ vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1
+ vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1
+ vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1
+ vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1
+ vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1
+ vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1
+
+ b v_next
+
+v_wide_mbfilter
+ sub r8, r0, #8
+
+ vtrn.32 d0, d26
+ vtrn.32 d16, d27
+ vtrn.32 d24, d18
+ vtrn.32 d25, d19
+
+ vtrn.16 d0, d24
+ vtrn.16 d16, d25
+ vtrn.16 d26, d18
+ vtrn.16 d27, d19
+
+ vtrn.8 d0, d16
+ vtrn.8 d24, d25
+ vtrn.8 d26, d27
+ vtrn.8 d18, d19
+
+ vtrn.32 d20, d1
+ vtrn.32 d21, d2
+ vtrn.32 d22, d3
+ vtrn.32 d23, d15
+
+ vtrn.16 d20, d22
+ vtrn.16 d21, d23
+ vtrn.16 d1, d3
+ vtrn.16 d2, d15
+
+ vtrn.8 d20, d21
+ vtrn.8 d22, d23
+ vtrn.8 d1, d2
+ vtrn.8 d3, d15
+
+ vst1.8 {d0}, [r8@64], r1
+ vst1.8 {d20}, [r0@64], r1
+ vst1.8 {d16}, [r8@64], r1
+ vst1.8 {d21}, [r0@64], r1
+ vst1.8 {d24}, [r8@64], r1
+ vst1.8 {d22}, [r0@64], r1
+ vst1.8 {d25}, [r8@64], r1
+ vst1.8 {d23}, [r0@64], r1
+ vst1.8 {d26}, [r8@64], r1
+ vst1.8 {d1}, [r0@64], r1
+ vst1.8 {d27}, [r8@64], r1
+ vst1.8 {d2}, [r0@64], r1
+ vst1.8 {d18}, [r8@64], r1
+ vst1.8 {d3}, [r0@64], r1
+ vst1.8 {d19}, [r8@64], r1
+ vst1.8 {d15}, [r0@64], r1
+
+v_next
+ subs r12, #1
+ bne v_count
+
+ vpop {d8-d15}
+ pop {r4-r8, pc}
+
+ ENDP ; |mb_lpf_vertical_edge_w|
+
+; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
+; const uint8_t *limit, const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh
+|vpx_lpf_vertical_16_neon| PROC
+ mov r12, #1
+ b mb_lpf_vertical_edge_w
+ ENDP ; |vpx_lpf_vertical_16_neon|
+
+; void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh
+|vpx_lpf_vertical_16_dual_neon| PROC
+ mov r12, #2
+ b mb_lpf_vertical_edge_w
+ ENDP ; |vpx_lpf_vertical_16_dual_neon|
+
+; void vpx_wide_mbfilter_neon();
+; This is a helper function for the loopfilters. The invidual functions do the
+; necessary load, transpose (if necessary) and store.
+;
+; r0-r3 PRESERVE
+; d16 blimit
+; d17 limit
+; d18 thresh
+; d0 p7
+; d1 p6
+; d2 p5
+; d3 p4
+; d4 p3
+; d5 p2
+; d6 p1
+; d7 p0
+; d8 q0
+; d9 q1
+; d10 q2
+; d11 q3
+; d12 q4
+; d13 q5
+; d14 q6
+; d15 q7
+|vpx_wide_mbfilter_neon| PROC
+ mov r7, #0
+
+ ; filter_mask
+ vabd.u8 d19, d4, d5 ; abs(p3 - p2)
+ vabd.u8 d20, d5, d6 ; abs(p2 - p1)
+ vabd.u8 d21, d6, d7 ; abs(p1 - p0)
+ vabd.u8 d22, d9, d8 ; abs(q1 - q0)
+ vabd.u8 d23, d10, d9 ; abs(q2 - q1)
+ vabd.u8 d24, d11, d10 ; abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1))
+ vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0))
+ vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2))
+ vmax.u8 d19, d19, d20
+
+ vabd.u8 d24, d7, d8 ; abs(p0 - q0)
+
+ vmax.u8 d19, d19, d23
+
+ vabd.u8 d23, d6, d9 ; a = abs(p1 - q1)
+ vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2
+
+ ; abs () > limit
+ vcge.u8 d19, d17, d19
+
+ ; flatmask4
+ vabd.u8 d25, d7, d5 ; abs(p0 - p2)
+ vabd.u8 d26, d8, d10 ; abs(q0 - q2)
+ vabd.u8 d27, d4, d7 ; abs(p3 - p0)
+ vabd.u8 d28, d11, d8 ; abs(q3 - q0)
+
+ ; only compare the largest value to thresh
+ vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2))
+ vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0))
+ vmax.u8 d25, d25, d26
+ vmax.u8 d20, d20, d25
+
+ vshr.u8 d23, d23, #1 ; a = a / 2
+ vqadd.u8 d24, d24, d23 ; a = b + a
+
+ vmov.u8 d30, #1
+ vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1
+
+ vcge.u8 d20, d30, d20 ; flat
+
+ vand d19, d19, d24 ; mask
+
+ ; hevmask
+ vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1
+ vorr d21, d21, d22 ; hev
+
+ vand d16, d20, d19 ; flat && mask
+ vmov r5, r6, d16
+
+ ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
+ vabd.u8 d22, d3, d7 ; abs(p4 - p0)
+ vabd.u8 d23, d12, d8 ; abs(q4 - q0)
+ vabd.u8 d24, d7, d2 ; abs(p0 - p5)
+ vabd.u8 d25, d8, d13 ; abs(q0 - q5)
+ vabd.u8 d26, d1, d7 ; abs(p6 - p0)
+ vabd.u8 d27, d14, d8 ; abs(q6 - q0)
+ vabd.u8 d28, d0, d7 ; abs(p7 - p0)
+ vabd.u8 d29, d15, d8 ; abs(q7 - q0)
+
+ ; only compare the largest value to thresh
+ vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0))
+ vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5))
+ vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0))
+ vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0))
+
+ vmax.u8 d26, d22, d23
+ vmax.u8 d27, d24, d25
+ vmax.u8 d23, d26, d27
+
+ vcge.u8 d18, d30, d23 ; flat2
+
+ vmov.u8 d22, #0x80
+
+ orrs r5, r5, r6 ; Check for 0
+ orreq r7, r7, #1 ; Only do filter branch
+
+ vand d17, d18, d16 ; flat2 && flat && mask
+ vmov r5, r6, d17
+
+ ; mbfilter() function
+
+ ; filter() function
+ ; convert to signed
+ veor d23, d8, d22 ; qs0
+ veor d24, d7, d22 ; ps0
+ veor d25, d6, d22 ; ps1
+ veor d26, d9, d22 ; qs1
+
+ vmov.u8 d27, #3
+
+ vsub.s8 d28, d23, d24 ; ( qs0 - ps0)
+ vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1)
+ vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0)
+ vand d29, d29, d21 ; filter &= hev
+ vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0)
+ vmov.u8 d29, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d28, q15
+
+ vand d28, d28, d19 ; filter &= mask
+
+ vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3)
+ vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4)
+ vshr.s8 d30, d30, #3 ; filter2 >>= 3
+ vshr.s8 d29, d29, #3 ; filter1 >>= 3
+
+
+ vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2)
+ vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1)
+
+ ; outer tap adjustments: ++filter1 >> 1
+ vrshr.s8 d29, d29, #1
+ vbic d29, d29, d21 ; filter &= ~hev
+
+ vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter)
+ vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter)
+
+ veor d24, d24, d22 ; *f_op0 = u^0x80
+ veor d23, d23, d22 ; *f_oq0 = u^0x80
+ veor d25, d25, d22 ; *f_op1 = u^0x80
+ veor d26, d26, d22 ; *f_oq1 = u^0x80
+
+ tst r7, #1
+ bxne lr
+
+ orrs r5, r5, r6 ; Check for 0
+ orreq r7, r7, #2 ; Only do mbfilter branch
+
+ ; mbfilter flat && mask branch
+ ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's
+ ; and using vibt on the q's?
+ vmov.u8 d29, #2
+ vaddl.u8 q15, d7, d8 ; op2 = p0 + q0
+ vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3
+ vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2
+ vaddl.u8 q10, d4, d5
+ vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
+ vaddl.u8 q14, d6, d9
+ vqrshrn.u16 d18, q15, #3 ; r_op2
+
+ vsub.i16 q15, q10
+ vaddl.u8 q10, d4, d6
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d7, d10
+ vqrshrn.u16 d19, q15, #3 ; r_op1
+
+ vsub.i16 q15, q10
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d8, d11
+ vqrshrn.u16 d20, q15, #3 ; r_op0
+
+ vsubw.u8 q15, d4 ; oq0 = op0 - p3
+ vsubw.u8 q15, d7 ; oq0 -= p0
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d9, d11
+ vqrshrn.u16 d21, q15, #3 ; r_oq0
+
+ vsubw.u8 q15, d5 ; oq1 = oq0 - p2
+ vsubw.u8 q15, d8 ; oq1 -= q0
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d10, d11
+ vqrshrn.u16 d22, q15, #3 ; r_oq1
+
+ vsubw.u8 q15, d6 ; oq2 = oq0 - p1
+ vsubw.u8 q15, d9 ; oq2 -= q1
+ vadd.i16 q15, q14
+ vqrshrn.u16 d27, q15, #3 ; r_oq2
+
+ ; Filter does not set op2 or oq2, so use p2 and q2.
+ vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask)
+ vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask)
+ vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask)
+ vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask)
+ vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask)
+
+ vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask)
+ vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask)
+
+ tst r7, #2
+ bxne lr
+
+ ; wide_mbfilter flat2 && flat && mask branch
+ vmov.u8 d16, #7
+ vaddl.u8 q15, d7, d8 ; op6 = p0 + q0
+ vaddl.u8 q12, d2, d3
+ vaddl.u8 q13, d4, d5
+ vaddl.u8 q14, d1, d6
+ vmlal.u8 q15, d0, d16 ; op6 += p7 * 3
+ vadd.i16 q12, q13
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d2, d9
+ vadd.i16 q15, q12
+ vaddl.u8 q12, d0, d1
+ vaddw.u8 q15, d1
+ vaddl.u8 q13, d0, d2
+ vadd.i16 q14, q15, q14
+ vqrshrn.u16 d16, q15, #4 ; w_op6
+
+ vsub.i16 q15, q14, q12
+ vaddl.u8 q14, d3, d10
+ vqrshrn.u16 d24, q15, #4 ; w_op5
+
+ vsub.i16 q15, q13
+ vaddl.u8 q13, d0, d3
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d4, d11
+ vqrshrn.u16 d25, q15, #4 ; w_op4
+
+ vadd.i16 q15, q14
+ vaddl.u8 q14, d0, d4
+ vsub.i16 q15, q13
+ vsub.i16 q14, q15, q14
+ vqrshrn.u16 d26, q15, #4 ; w_op3
+
+ vaddw.u8 q15, q14, d5 ; op2 += p2
+ vaddl.u8 q14, d0, d5
+ vaddw.u8 q15, d12 ; op2 += q4
+ vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m)
+ vqrshrn.u16 d27, q15, #4 ; w_op2
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d0, d6
+ vaddw.u8 q15, d6 ; op1 += p1
+ vaddw.u8 q15, d13 ; op1 += q5
+ vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m)
+ vqrshrn.u16 d18, q15, #4 ; w_op1
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d0, d7
+ vaddw.u8 q15, d7 ; op0 += p0
+ vaddw.u8 q15, d14 ; op0 += q6
+ vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m)
+ vqrshrn.u16 d19, q15, #4 ; w_op0
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d1, d8
+ vaddw.u8 q15, d8 ; oq0 += q0
+ vaddw.u8 q15, d15 ; oq0 += q7
+ vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m)
+ vqrshrn.u16 d20, q15, #4 ; w_oq0
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d2, d9
+ vaddw.u8 q15, d9 ; oq1 += q1
+ vaddl.u8 q4, d10, d15
+ vaddw.u8 q15, d15 ; oq1 += q7
+ vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m)
+ vqrshrn.u16 d21, q15, #4 ; w_oq1
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d3, d10
+ vadd.i16 q15, q4
+ vaddl.u8 q4, d11, d15
+ vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m)
+ vqrshrn.u16 d22, q15, #4 ; w_oq2
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d4, d11
+ vadd.i16 q15, q4
+ vaddl.u8 q4, d12, d15
+ vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m)
+ vqrshrn.u16 d23, q15, #4 ; w_oq3
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d5, d12
+ vadd.i16 q15, q4
+ vaddl.u8 q4, d13, d15
+ vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m)
+ vqrshrn.u16 d1, q15, #4 ; w_oq4
+
+ vsub.i16 q15, q14
+ vaddl.u8 q14, d6, d13
+ vadd.i16 q15, q4
+ vaddl.u8 q4, d14, d15
+ vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m)
+ vqrshrn.u16 d2, q15, #4 ; w_oq5
+
+ vsub.i16 q15, q14
+ vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m)
+ vadd.i16 q15, q4
+ vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m)
+ vqrshrn.u16 d3, q15, #4 ; w_oq6
+ vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m)
+ vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m)
+ vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m)
+
+ bx lr
+ ENDP ; |vpx_wide_mbfilter_neon|
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm
new file mode 100644
index 0000000000..907e918380
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm
@@ -0,0 +1,549 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_lpf_horizontal_4_neon|
+ EXPORT |vpx_lpf_vertical_4_neon|
+ EXPORT |vpx_lpf_horizontal_4_dual_neon|
+ EXPORT |vpx_lpf_vertical_4_dual_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+;
+; void vpx_lpf_horizontal_4_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+|vpx_lpf_horizontal_4_neon| PROC
+ push {lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r2, [sp, #4] ; load thresh
+ add r1, r1, r1 ; double pitch
+
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+ vld1.8 {d2[]}, [r2] ; duplicate *thresh
+
+ sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines
+ add r3, r2, r1, lsr #1 ; set to 3 lines down
+
+ vld1.u8 {d3}, [r2@64], r1 ; p3
+ vld1.u8 {d4}, [r3@64], r1 ; p2
+ vld1.u8 {d5}, [r2@64], r1 ; p1
+ vld1.u8 {d6}, [r3@64], r1 ; p0
+ vld1.u8 {d7}, [r2@64], r1 ; q0
+ vld1.u8 {d16}, [r3@64], r1 ; q1
+ vld1.u8 {d17}, [r2@64] ; q2
+ vld1.u8 {d18}, [r3@64] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r3, r3, r1, lsl #1
+
+ bl filter4_8
+
+ vst1.u8 {d4}, [r2@64], r1 ; store op1
+ vst1.u8 {d5}, [r3@64], r1 ; store op0
+ vst1.u8 {d6}, [r2@64], r1 ; store oq0
+ vst1.u8 {d7}, [r3@64], r1 ; store oq1
+
+ pop {pc}
+ ENDP ; |vpx_lpf_horizontal_4_neon|
+
+; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+;
+; void vpx_lpf_vertical_4_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+|vpx_lpf_vertical_4_neon| PROC
+ push {lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+
+ ldr r3, [sp, #4] ; load thresh
+ sub r2, r0, #4 ; move s pointer down by 4 columns
+
+ vld1.8 {d2[]}, [r3] ; duplicate *thresh
+
+ vld1.u8 {d3}, [r2], r1 ; load s data
+ vld1.u8 {d4}, [r2], r1
+ vld1.u8 {d5}, [r2], r1
+ vld1.u8 {d6}, [r2], r1
+ vld1.u8 {d7}, [r2], r1
+ vld1.u8 {d16}, [r2], r1
+ vld1.u8 {d17}, [r2], r1
+ vld1.u8 {d18}, [r2]
+
+    ;transpose to 8x8 matrix
+ vtrn.32 d3, d7
+ vtrn.32 d4, d16
+ vtrn.32 d5, d17
+ vtrn.32 d6, d18
+
+ vtrn.16 d3, d5
+ vtrn.16 d4, d6
+ vtrn.16 d7, d17
+ vtrn.16 d16, d18
+
+ vtrn.8 d3, d4
+ vtrn.8 d5, d6
+ vtrn.8 d7, d16
+ vtrn.8 d17, d18
+
+ bl filter4_8
+
+ sub r0, r0, #2
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
+ vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
+ vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
+ vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
+ vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
+ vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
+ vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
+ vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0]
+
+ pop {pc}
+ ENDP ; |vpx_lpf_vertical_4_neon|
+
+; void filter4_8();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0 blimit
+; d1 limit
+; d2 thresh
+; d3 p3
+; d4 p2
+; d5 p1
+; d6 p0
+; d7 q0
+; d16 q1
+; d17 q2
+; d18 q3
+;
+; Outputs:
+; d4 op1
+; d5 op0
+; d6 oq0
+; d7 oq1
+|filter4_8| PROC
+ ; filter_mask
+ vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
+ vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
+ vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
+ vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
+ vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1)
+ vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
+ vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
+
+ vabd.u8 d17, d6, d7 ; abs(p0 - q0)
+
+ vmax.u8 d3, d3, d4 ; m3 = max(m5, m6)
+
+ vmov.u8 d18, #0x80
+
+ vmax.u8 d23, d19, d20 ; m1 = max(m1, m2)
+
+ ; hevmask
+ vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 d23, d23, d3 ; m1 = max(m1, m3)
+
+ vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
+ vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2
+
+ veor d7, d7, d18 ; qs0
+
+ vcge.u8 d23, d1, d23 ; abs(m1) > limit
+
+ ; filter() function
+ ; convert to signed
+
+ vshr.u8 d28, d28, #1 ; a = a / 2
+ veor d6, d6, d18 ; ps0
+
+ veor d5, d5, d18 ; ps1
+ vqadd.u8 d17, d17, d28 ; a = b + a
+
+ veor d16, d16, d18 ; qs1
+
+ vmov.u8 d19, #3
+
+ vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
+
+ vcge.u8 d17, d0, d17 ; a > blimit
+
+ vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
+ vorr d22, d21, d22 ; hevmask
+
+ vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0)
+
+ vand d27, d27, d22 ; filter &= hev
+ vand d23, d23, d17 ; filter_mask
+
+ vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0)
+
+ vmov.u8 d17, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d27, q12
+
+ vand d27, d27, d23 ; filter &= mask
+
+ vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
+ vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
+ vshr.s8 d28, d28, #3 ; filter2 >>= 3
+ vshr.s8 d27, d27, #3 ; filter1 >>= 3
+
+ vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
+ vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
+
+ ; outer tap adjustments
+ vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1
+
+ veor d6, d26, d18 ; *oq0 = u^0x80
+
+ vbic d27, d27, d22 ; filter &= ~hev
+
+ vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter)
+ vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter)
+
+ veor d5, d19, d18 ; *op0 = u^0x80
+ veor d4, d21, d18 ; *op1 = u^0x80
+ veor d7, d20, d18 ; *oq1 = u^0x80
+
+ bx lr
+ ENDP ; |filter4_8|
+
+;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
+; const uint8_t *blimit0,
+; const uint8_t *limit0,
+; const uint8_t *thresh0,
+; const uint8_t *blimit1,
+; const uint8_t *limit1,
+; const uint8_t *thresh1)
+; r0 uint8_t *s,
+; r1 int p,
+; r2 const uint8_t *blimit0,
+; r3 const uint8_t *limit0,
+; sp const uint8_t *thresh0,
+; sp+4 const uint8_t *blimit1,
+; sp+8 const uint8_t *limit1,
+; sp+12 const uint8_t *thresh1,
+
+|vpx_lpf_horizontal_4_dual_neon| PROC
+ push {lr}
+
+ ldr r12, [sp, #4] ; load thresh0
+ vld1.8 {d0}, [r2] ; load blimit0 to first half q
+ vld1.8 {d2}, [r3] ; load limit0 to first half q
+
+ add r1, r1, r1 ; double pitch
+ ldr r2, [sp, #8] ; load blimit1
+
+ vld1.8 {d4}, [r12] ; load thresh0 to first half q
+
+ ldr r3, [sp, #12] ; load limit1
+ ldr r12, [sp, #16] ; load thresh1
+ vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q
+
+ sub r2, r0, r1, lsl #1 ; s[-4 * p]
+
+ vld1.8 {d3}, [r3] ; load limit1 to 2nd half q
+ vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q
+
+ vpush {d8-d15} ; save neon registers
+
+ add r3, r2, r1, lsr #1 ; s[-3 * p]
+
+ vld1.u8 {q3}, [r2@64], r1 ; p3
+ vld1.u8 {q4}, [r3@64], r1 ; p2
+ vld1.u8 {q5}, [r2@64], r1 ; p1
+ vld1.u8 {q6}, [r3@64], r1 ; p0
+ vld1.u8 {q7}, [r2@64], r1 ; q0
+ vld1.u8 {q8}, [r3@64], r1 ; q1
+ vld1.u8 {q9}, [r2@64] ; q2
+ vld1.u8 {q10}, [r3@64] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r3, r3, r1, lsl #1
+
+ bl filter4_16
+
+ vst1.u8 {q5}, [r2@64], r1 ; store op1
+ vst1.u8 {q6}, [r3@64], r1 ; store op0
+ vst1.u8 {q7}, [r2@64], r1 ; store oq0
+ vst1.u8 {q8}, [r3@64], r1 ; store oq1
+
+ vpop {d8-d15} ; restore neon registers
+
+ pop {pc}
+ ENDP ; |vpx_lpf_horizontal_4_dual_neon|
+
+;void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p,
+; const uint8_t *blimit0,
+; const uint8_t *limit0,
+; const uint8_t *thresh0,
+; const uint8_t *blimit1,
+; const uint8_t *limit1,
+; const uint8_t *thresh1)
+; r0 uint8_t *s,
+; r1 int p,
+; r2 const uint8_t *blimit0,
+; r3 const uint8_t *limit0,
+; sp const uint8_t *thresh0,
+; sp+4 const uint8_t *blimit1,
+; sp+8 const uint8_t *limit1,
+; sp+12 const uint8_t *thresh1,
+
+|vpx_lpf_vertical_4_dual_neon| PROC
+ push {lr}
+
+ ldr r12, [sp, #4] ; load thresh0
+ vld1.8 {d0}, [r2] ; load blimit0 to first half q
+ vld1.8 {d2}, [r3] ; load limit0 to first half q
+
+ ldr r2, [sp, #8] ; load blimit1
+
+ vld1.8 {d4}, [r12] ; load thresh0 to first half q
+
+ ldr r3, [sp, #12] ; load limit1
+ ldr r12, [sp, #16] ; load thresh1
+ vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q
+
+ sub r2, r0, #4 ; s[-4]
+
+ vld1.8 {d3}, [r3] ; load limit1 to 2nd half q
+ vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q
+
+ vpush {d8-d15} ; save neon registers
+
+ vld1.u8 {d6}, [r2], r1 ; 00 01 02 03 04 05 06 07
+ vld1.u8 {d8}, [r2], r1 ; 10 11 12 13 14 15 16 17
+ vld1.u8 {d10}, [r2], r1 ; 20 21 22 23 24 25 26 27
+ vld1.u8 {d12}, [r2], r1 ; 30 31 32 33 34 35 36 37
+ vld1.u8 {d14}, [r2], r1 ; 40 41 42 43 44 45 46 47
+ vld1.u8 {d16}, [r2], r1 ; 50 51 52 53 54 55 56 57
+ vld1.u8 {d18}, [r2], r1 ; 60 61 62 63 64 65 66 67
+ vld1.u8 {d20}, [r2], r1 ; 70 71 72 73 74 75 76 77
+ vld1.u8 {d7}, [r2], r1 ; 80 81 82 83 84 85 86 87
+ vld1.u8 {d9}, [r2], r1 ; 90 91 92 93 94 95 96 97
+ vld1.u8 {d11}, [r2], r1 ; A0 A1 A2 A3 A4 A5 A6 A7
+ vld1.u8 {d13}, [r2], r1 ; B0 B1 B2 B3 B4 B5 B6 B7
+ vld1.u8 {d15}, [r2], r1 ; C0 C1 C2 C3 C4 C5 C6 C7
+ vld1.u8 {d17}, [r2], r1 ; D0 D1 D2 D3 D4 D5 D6 D7
+ vld1.u8 {d19}, [r2], r1 ; E0 E1 E2 E3 E4 E5 E6 E7
+ vld1.u8 {d21}, [r2] ; F0 F1 F2 F3 F4 F5 F6 F7
+
+ vtrn.8 q3, q4 ; q3 : 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
+ ; q4 : 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
+ vtrn.8 q5, q6 ; q5 : 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6
+ ; q6 : 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7
+ vtrn.8 q7, q8 ; q7 : 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6
+ ; q8 : 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7
+ vtrn.8 q9, q10 ; q9 : 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6
+ ; q10: 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7
+
+ vtrn.16 q3, q5 ; q3 : 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4
+ ; q5 : 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6
+ vtrn.16 q4, q6 ; q4 : 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5
+ ; q6 : 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7
+ vtrn.16 q7, q9 ; q7 : 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4
+ ; q9 : 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6
+ vtrn.16 q8, q10 ; q8 : 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5
+ ; q10: 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7
+
+ vtrn.32 q3, q7 ; q3 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
+ ; q7 : 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
+ vtrn.32 q5, q9 ; q5 : 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
+ ; q9 : 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
+ vtrn.32 q4, q8 ; q4 : 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
+ ; q8 : 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
+ vtrn.32 q6, q10 ; q6 : 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
+ ; q10: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
+
+ bl filter4_16
+
+ sub r0, #2
+
+ vmov d0, d11
+ vmov d1, d13
+ vmov d2, d15
+ vmov d3, d17
+ vmov d11, d12
+ vmov d12, d14
+ vmov d13, d16
+ vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+ vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
+ vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+ vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
+ vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+ vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+ vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
+ vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
+ vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
+ vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1
+ vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r0], r1
+ vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r0], r1
+ vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r0], r1
+ vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r0]
+
+ vpop {d8-d15} ; restore neon registers
+
+ pop {pc}
+ ENDP ; |vpx_lpf_vertical_4_dual_neon|
+
+; void filter4_16();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. This function uses
+; registers d8-d15, so the calling function must save those registers.
+;
+; r0-r3, r12 PRESERVE
+; q0 blimit
+; q1 limit
+; q2 thresh
+; q3 p3
+; q4 p2
+; q5 p1
+; q6 p0
+; q7 q0
+; q8 q1
+; q9 q2
+; q10 q3
+;
+; Outputs:
+; q5 op1
+; q6 op0
+; q7 oq0
+; q8 oq1
+|filter4_16| PROC
+
+ ; filter_mask
+ vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0)
+ vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1)
+ vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 q11, q11, q12 ; m7 = max(m1, m2)
+ vmax.u8 q12, q13, q14 ; m8 = max(m3, m4)
+
+ vabd.u8 q9, q6, q7 ; abs(p0 - q0)
+
+ vmax.u8 q3, q3, q4 ; m9 = max(m5, m6)
+
+ vmov.u8 q10, #0x80
+
+ vmax.u8 q15, q11, q12 ; m10 = max(m7, m8)
+
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 q15, q15, q3 ; m11 = max(m10, m9)
+
+ vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
+ vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
+
+ veor q7, q7, q10 ; qs0
+
+ vcge.u8 q15, q1, q15 ; abs(m11) > limit
+
+ vshr.u8 q2, q2, #1 ; a = a / 2
+ veor q6, q6, q10 ; ps0
+
+ veor q5, q5, q10 ; ps1
+ vqadd.u8 q9, q9, q2 ; a = b + a
+
+ veor q8, q8, q10 ; qs1
+
+ vmov.u16 q4, #3
+
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
+ vsubl.s8 q11, d15, d13
+
+ vcge.u8 q9, q0, q9 ; a > blimit
+
+ vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1)
+ vorr q14, q13, q14 ; hev
+
+ vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
+ vmul.i16 q11, q11, q4
+
+ vand q1, q1, q14 ; filter &= hev
+ vand q15, q15, q9 ; mask
+
+ vmov.u8 q4, #3
+
+ vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0)
+ vaddw.s8 q11, q11, d3
+
+ vmov.u8 q9, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d2, q2
+ vqmovn.s16 d3, q11
+ vand q1, q1, q15 ; filter &= mask
+
+ vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3)
+ vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4)
+ vshr.s8 q2, q2, #3 ; filter2 >>= 3
+ vshr.s8 q1, q1, #3 ; filter1 >>= 3
+
+
+ vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2)
+ vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1)
+
+ ; outer tap adjustments
+ vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1
+
+ veor q7, q0, q10 ; *oq0 = u^0x80
+
+ vbic q1, q1, q14 ; filter &= ~hev
+
+ vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter)
+ vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter)
+
+ veor q6, q11, q10 ; *op0 = u^0x80
+ veor q5, q13, q10 ; *op1 = u^0x80
+ veor q8, q12, q10 ; *oq1 = u^0x80
+
+ bx lr
+ ENDP ; |filter4_16|
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm
new file mode 100644
index 0000000000..a81a9d1013
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm
@@ -0,0 +1,491 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_lpf_horizontal_8_neon|
+ EXPORT |vpx_lpf_horizontal_8_dual_neon|
+ EXPORT |vpx_lpf_vertical_8_neon|
+ EXPORT |vpx_lpf_vertical_8_dual_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+;
+; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+|vpx_lpf_horizontal_8_neon| PROC
+ push {r4-r5, lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r2, [sp, #12] ; load thresh
+ add r1, r1, r1 ; double pitch
+
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+ vld1.8 {d2[]}, [r2] ; duplicate *thresh
+
+ sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines
+ add r2, r3, r1, lsr #1 ; set to 3 lines down
+
+ vld1.u8 {d3}, [r3@64], r1 ; p3
+ vld1.u8 {d4}, [r2@64], r1 ; p2
+ vld1.u8 {d5}, [r3@64], r1 ; p1
+ vld1.u8 {d6}, [r2@64], r1 ; p0
+ vld1.u8 {d7}, [r3@64], r1 ; q0
+ vld1.u8 {d16}, [r2@64], r1 ; q1
+ vld1.u8 {d17}, [r3@64] ; q2
+ vld1.u8 {d18}, [r2@64], r1 ; q3
+
+ sub r3, r3, r1, lsl #1
+ sub r2, r2, r1, lsl #2
+
+ bl vpx_mbloop_filter_neon
+
+ vst1.u8 {d0}, [r2@64], r1 ; store op2
+ vst1.u8 {d1}, [r3@64], r1 ; store op1
+ vst1.u8 {d2}, [r2@64], r1 ; store op0
+ vst1.u8 {d3}, [r3@64], r1 ; store oq0
+ vst1.u8 {d4}, [r2@64], r1 ; store oq1
+ vst1.u8 {d5}, [r3@64], r1 ; store oq2
+
+ pop {r4-r5, pc}
+
+ ENDP ; |vpx_lpf_horizontal_8_neon|
+
+;void vpx_lpf_horizontal_8_dual_neon(uint8_t *s,
+; int p,
+; const uint8_t *blimit0,
+; const uint8_t *limit0,
+; const uint8_t *thresh0,
+; const uint8_t *blimit1,
+; const uint8_t *limit1,
+; const uint8_t *thresh1)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit0,
+; r3 const uint8_t *limit0,
+; sp const uint8_t *thresh0,
+; sp + 4 const uint8_t *blimit1,
+; sp + 8 const uint8_t *limit1,
+; sp + 12 const uint8_t *thresh1,
+|vpx_lpf_horizontal_8_dual_neon| PROC
+ push {r0-r1, lr}
+ ldr lr, [sp, #12]
+ push {lr} ; thresh0
+ bl vpx_lpf_horizontal_8_neon
+
+ ldr r2, [sp, #20] ; blimit1
+ ldr r3, [sp, #24] ; limit1
+ ldr lr, [sp, #28]
+ str lr, [sp, #16] ; thresh1
+ add sp, #4
+ pop {r0-r1, lr}
+ add r0, #8 ; s + 8
+ b vpx_lpf_horizontal_8_neon
+ ENDP ; |vpx_lpf_horizontal_8_dual_neon|
+
+; void vpx_lpf_vertical_8_neon(uint8_t *s,
+; int pitch,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+;
+; r0 uint8_t *s,
+; r1 int pitch,
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+|vpx_lpf_vertical_8_neon| PROC
+ push {r4-r5, lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+
+ ldr r3, [sp, #12] ; load thresh
+ sub r2, r0, #4 ; move s pointer down by 4 columns
+
+ vld1.8 {d2[]}, [r3] ; duplicate *thresh
+
+ vld1.u8 {d3}, [r2], r1 ; load s data
+ vld1.u8 {d4}, [r2], r1
+ vld1.u8 {d5}, [r2], r1
+ vld1.u8 {d6}, [r2], r1
+ vld1.u8 {d7}, [r2], r1
+ vld1.u8 {d16}, [r2], r1
+ vld1.u8 {d17}, [r2], r1
+ vld1.u8 {d18}, [r2]
+
+    ;transpose to 8x8 matrix
+ vtrn.32 d3, d7
+ vtrn.32 d4, d16
+ vtrn.32 d5, d17
+ vtrn.32 d6, d18
+
+ vtrn.16 d3, d5
+ vtrn.16 d4, d6
+ vtrn.16 d7, d17
+ vtrn.16 d16, d18
+
+ vtrn.8 d3, d4
+ vtrn.8 d5, d6
+ vtrn.8 d7, d16
+ vtrn.8 d17, d18
+
+ sub r2, r0, #3
+ add r3, r0, #1
+
+ bl vpx_mbloop_filter_neon
+
+ ;store op2, op1, op0, oq0
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
+ vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r2], r1
+ vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r2], r1
+ vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r2], r1
+ vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r2], r1
+ vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r2], r1
+ vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r2], r1
+ vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r2]
+
+ ;store oq1, oq2
+ vst2.8 {d4[0], d5[0]}, [r3], r1
+ vst2.8 {d4[1], d5[1]}, [r3], r1
+ vst2.8 {d4[2], d5[2]}, [r3], r1
+ vst2.8 {d4[3], d5[3]}, [r3], r1
+ vst2.8 {d4[4], d5[4]}, [r3], r1
+ vst2.8 {d4[5], d5[5]}, [r3], r1
+ vst2.8 {d4[6], d5[6]}, [r3], r1
+ vst2.8 {d4[7], d5[7]}, [r3]
+
+ pop {r4-r5, pc}
+ ENDP ; |vpx_lpf_vertical_8_neon|
+
+;void vpx_lpf_vertical_8_dual_neon(uint8_t *s,
+; int pitch,
+; const uint8_t *blimit0,
+; const uint8_t *limit0,
+; const uint8_t *thresh0,
+; const uint8_t *blimit1,
+; const uint8_t *limit1,
+; const uint8_t *thresh1)
+; r0 uint8_t *s,
+; r1 int pitch
+; r2 const uint8_t *blimit0,
+; r3 const uint8_t *limit0,
+; sp const uint8_t *thresh0,
+; sp + 4 const uint8_t *blimit1,
+; sp + 8 const uint8_t *limit1,
+; sp + 12 const uint8_t *thresh1,
+|vpx_lpf_vertical_8_dual_neon| PROC
+ push {r0-r1, lr}
+ ldr lr, [sp, #12]
+ push {lr} ; thresh0
+ bl vpx_lpf_vertical_8_neon
+
+ ldr r2, [sp, #20] ; blimit1
+ ldr r3, [sp, #24] ; limit1
+ ldr lr, [sp, #28]
+ str lr, [sp, #16] ; thresh1
+ add sp, #4
+ pop {r0-r1, lr}
+ add r0, r0, r1, lsl #3 ; s + 8 * pitch
+ b vpx_lpf_vertical_8_neon
+ ENDP ; |vpx_lpf_vertical_8_dual_neon|
+
+; void vpx_mbloop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0 blimit
+; d1 limit
+; d2 thresh
+; d3 p3
+; d4 p2
+; d5 p1
+; d6 p0
+; d7 q0
+; d16 q1
+; d17 q2
+; d18 q3
+;
+; Outputs:
+; d0 op2
+; d1 op1
+; d2 op0
+; d3 oq0
+; d4 oq1
+; d5 oq2
+|vpx_mbloop_filter_neon| PROC
+ ; filter_mask
+ vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
+ vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
+ vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
+ vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
+ vabd.u8 d23, d17, d16 ; m5 = abs(q2 - q1)
+ vabd.u8 d24, d18, d17 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
+ vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
+
+ vabd.u8 d25, d6, d4 ; m7 = abs(p0 - p2)
+
+ vmax.u8 d23, d23, d24 ; m3 = max(m5, m6)
+
+ vabd.u8 d26, d7, d17 ; m8 = abs(q0 - q2)
+
+ vmax.u8 d19, d19, d20
+
+ vabd.u8 d24, d6, d7 ; m9 = abs(p0 - q0)
+ vabd.u8 d27, d3, d6 ; m10 = abs(p3 - p0)
+ vabd.u8 d28, d18, d7 ; m11 = abs(q3 - q0)
+
+ vmax.u8 d19, d19, d23
+
+ vabd.u8 d23, d5, d16 ; a = abs(p1 - q1)
+ vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2
+
+ ; abs () > limit
+ vcge.u8 d19, d1, d19
+
+ ; only compare the largest value to thresh
+ vmax.u8 d25, d25, d26 ; m4 = max(m7, m8)
+ vmax.u8 d26, d27, d28 ; m5 = max(m10, m11)
+
+ vshr.u8 d23, d23, #1 ; a = a / 2
+
+ vmax.u8 d25, d25, d26 ; m4 = max(m4, m5)
+
+ vqadd.u8 d24, d24, d23 ; a = b + a
+
+ vmax.u8 d20, d20, d25 ; m2 = max(m2, m4)
+
+ vmov.u8 d23, #1
+ vcge.u8 d24, d0, d24 ; a > blimit
+
+ vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
+
+ vcge.u8 d20, d23, d20 ; flat
+
+ vand d19, d19, d24 ; mask
+
+ vcgt.u8 d23, d22, d2 ; (abs(q1 - q0) > thresh)*-1
+
+ vand d20, d20, d19 ; flat & mask
+
+ vmov.u8 d22, #0x80
+
+ vorr d23, d21, d23 ; hev
+
+ ; This instruction will truncate the "flat & mask" masks down to 4 bits
+ ; each to fit into one 32 bit arm register. The values are stored in
+ ; q10.64[0].
+ vshrn.u16 d30, q10, #4
+ vmov.u32 r4, d30[0] ; flat & mask 4bits
+
+ adds r5, r4, #1 ; Check for all 1's
+
+ ; If mask and flat are 1's for all vectors, then we only need to execute
+ ; the power branch for all vectors.
+ beq power_branch_only
+
+ cmp r4, #0 ; Check for 0, set flag for later
+
+ ; mbfilter() function
+ ; filter() function
+ ; convert to signed
+ veor d21, d7, d22 ; qs0
+ veor d24, d6, d22 ; ps0
+ veor d25, d5, d22 ; ps1
+ veor d26, d16, d22 ; qs1
+
+ vmov.u8 d27, #3
+
+ vsub.s8 d28, d21, d24 ; ( qs0 - ps0)
+
+ vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1)
+
+ vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0)
+
+ vand d29, d29, d23 ; filter &= hev
+
+ vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0)
+
+ vmov.u8 d29, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d28, q15
+
+ vand d28, d28, d19 ; filter &= mask
+
+ vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3)
+ vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4)
+ vshr.s8 d30, d30, #3 ; filter2 >>= 3
+ vshr.s8 d29, d29, #3 ; filter1 >>= 3
+
+ vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2)
+ vqsub.s8 d21, d21, d29 ; oq0 = clamp(qs0 - filter1)
+
+ ; outer tap adjustments: ++filter1 >> 1
+ vrshr.s8 d29, d29, #1
+ vbic d29, d29, d23 ; filter &= ~hev
+
+ vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter)
+ vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter)
+
+ ; If mask and flat are 0's for all vectors, then we only need to execute
+ ; the filter branch for all vectors.
+ beq filter_branch_only
+
+ ; If mask and flat are mixed then we must perform both branches and
+ ; combine the data.
+ veor d24, d24, d22 ; *f_op0 = u^0x80
+ veor d21, d21, d22 ; *f_oq0 = u^0x80
+ veor d25, d25, d22 ; *f_op1 = u^0x80
+ veor d26, d26, d22 ; *f_oq1 = u^0x80
+
+ ; At this point we have already executed the filter branch. The filter
+ ; branch does not set op2 or oq2, so use p2 and q2. Execute the power
+ ; branch and combine the data.
+ vmov.u8 d23, #2
+ vaddl.u8 q14, d6, d7 ; r_op2 = p0 + q0
+ vmlal.u8 q14, d3, d27 ; r_op2 += p3 * 3
+ vmlal.u8 q14, d4, d23 ; r_op2 += p2 * 2
+
+ vbif d0, d4, d20 ; op2 |= p2 & ~(flat & mask)
+
+ vaddw.u8 q14, d5 ; r_op2 += p1
+
+ vbif d1, d25, d20 ; op1 |= f_op1 & ~(flat & mask)
+
+ vqrshrn.u16 d30, q14, #3 ; r_op2
+
+ vsubw.u8 q14, d3 ; r_op1 = r_op2 - p3
+ vsubw.u8 q14, d4 ; r_op1 -= p2
+ vaddw.u8 q14, d5 ; r_op1 += p1
+ vaddw.u8 q14, d16 ; r_op1 += q1
+
+ vbif d2, d24, d20 ; op0 |= f_op0 & ~(flat & mask)
+
+ vqrshrn.u16 d31, q14, #3 ; r_op1
+
+ vsubw.u8 q14, d3 ; r_op0 = r_op1 - p3
+ vsubw.u8 q14, d5 ; r_op0 -= p1
+ vaddw.u8 q14, d6 ; r_op0 += p0
+ vaddw.u8 q14, d17 ; r_op0 += q2
+
+ vbit d0, d30, d20 ; op2 |= r_op2 & (flat & mask)
+
+ vqrshrn.u16 d23, q14, #3 ; r_op0
+
+ vsubw.u8 q14, d3 ; r_oq0 = r_op0 - p3
+ vsubw.u8 q14, d6 ; r_oq0 -= p0
+ vaddw.u8 q14, d7 ; r_oq0 += q0
+
+ vbit d1, d31, d20 ; op1 |= r_op1 & (flat & mask)
+
+ vaddw.u8 q14, d18 ; oq0 += q3
+
+ vbit d2, d23, d20 ; op0 |= r_op0 & (flat & mask)
+
+ vqrshrn.u16 d22, q14, #3 ; r_oq0
+
+ vsubw.u8 q14, d4 ; r_oq1 = r_oq0 - p2
+ vsubw.u8 q14, d7 ; r_oq1 -= q0
+ vaddw.u8 q14, d16 ; r_oq1 += q1
+
+ vbif d3, d21, d20 ; oq0 |= f_oq0 & ~(flat & mask)
+
+ vaddw.u8 q14, d18 ; r_oq1 += q3
+
+ vbif d4, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask)
+
+ vqrshrn.u16 d6, q14, #3 ; r_oq1
+
+ vsubw.u8 q14, d5 ; r_oq2 = r_oq1 - p1
+ vsubw.u8 q14, d16 ; r_oq2 -= q1
+ vaddw.u8 q14, d17 ; r_oq2 += q2
+ vaddw.u8 q14, d18 ; r_oq2 += q3
+
+ vbif d5, d17, d20 ; oq2 |= q2 & ~(flat & mask)
+
+ vqrshrn.u16 d7, q14, #3 ; r_oq2
+
+ vbit d3, d22, d20 ; oq0 |= r_oq0 & (flat & mask)
+ vbit d4, d6, d20 ; oq1 |= r_oq1 & (flat & mask)
+ vbit d5, d7, d20 ; oq2 |= r_oq2 & (flat & mask)
+
+ bx lr
+
+power_branch_only
+ vmov.u8 d27, #3
+ vmov.u8 d21, #2
+ vaddl.u8 q14, d6, d7 ; op2 = p0 + q0
+ vmlal.u8 q14, d3, d27 ; op2 += p3 * 3
+ vmlal.u8 q14, d4, d21 ; op2 += p2 * 2
+ vaddw.u8 q14, d5 ; op2 += p1
+ vqrshrn.u16 d0, q14, #3 ; op2
+
+ vsubw.u8 q14, d3 ; op1 = op2 - p3
+ vsubw.u8 q14, d4 ; op1 -= p2
+ vaddw.u8 q14, d5 ; op1 += p1
+ vaddw.u8 q14, d16 ; op1 += q1
+ vqrshrn.u16 d1, q14, #3 ; op1
+
+ vsubw.u8 q14, d3 ; op0 = op1 - p3
+ vsubw.u8 q14, d5 ; op0 -= p1
+ vaddw.u8 q14, d6 ; op0 += p0
+ vaddw.u8 q14, d17 ; op0 += q2
+ vqrshrn.u16 d2, q14, #3 ; op0
+
+ vsubw.u8 q14, d3 ; oq0 = op0 - p3
+ vsubw.u8 q14, d6 ; oq0 -= p0
+ vaddw.u8 q14, d7 ; oq0 += q0
+ vaddw.u8 q14, d18 ; oq0 += q3
+ vqrshrn.u16 d3, q14, #3 ; oq0
+
+ vsubw.u8 q14, d4 ; oq1 = oq0 - p2
+ vsubw.u8 q14, d7 ; oq1 -= q0
+ vaddw.u8 q14, d16 ; oq1 += q1
+ vaddw.u8 q14, d18 ; oq1 += q3
+ vqrshrn.u16 d4, q14, #3 ; oq1
+
+ vsubw.u8 q14, d5 ; oq2 = oq1 - p1
+ vsubw.u8 q14, d16 ; oq2 -= q1
+ vaddw.u8 q14, d17 ; oq2 += q2
+ vaddw.u8 q14, d18 ; oq2 += q3
+ vqrshrn.u16 d5, q14, #3 ; oq2
+
+ bx lr
+
+filter_branch_only
+    ; TODO(fgalligan): See if we can rearrange registers so we do not need to
+ ; do the 2 vswp.
+ vswp d0, d4 ; op2
+ vswp d5, d17 ; oq2
+ veor d2, d24, d22 ; *op0 = u^0x80
+ veor d3, d21, d22 ; *oq0 = u^0x80
+ veor d1, d25, d22 ; *op1 = u^0x80
+ veor d4, d26, d22 ; *oq1 = u^0x80
+
+ bx lr
+
+ ENDP ; |vpx_mbloop_filter_neon|
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
new file mode 100644
index 0000000000..c54e588239
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
@@ -0,0 +1,1107 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+// For all the static inline functions, the functions ending with '_8' process
+// 8 samples in a bunch, and the functions ending with '_16' process 16 samples
+// in a bunch.
+
+#define FUN_LOAD_THRESH(w, r) \
+ static INLINE void load_thresh_##w( \
+ const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \
+ uint8x##w##_t *blimit_vec, uint8x##w##_t *limit_vec, \
+ uint8x##w##_t *thresh_vec) { \
+ *blimit_vec = vld1##r##dup_u8(blimit); \
+ *limit_vec = vld1##r##dup_u8(limit); \
+ *thresh_vec = vld1##r##dup_u8(thresh); \
+ }
+
+FUN_LOAD_THRESH(8, _) // load_thresh_8
+FUN_LOAD_THRESH(16, q_) // load_thresh_16
+#undef FUN_LOAD_THRESH
+
+static INLINE void load_thresh_8_dual(
+ const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,
+ uint8x16_t *blimit_vec, uint8x16_t *limit_vec, uint8x16_t *thresh_vec) {
+ *blimit_vec = vcombine_u8(vld1_dup_u8(blimit0), vld1_dup_u8(blimit1));
+ *limit_vec = vcombine_u8(vld1_dup_u8(limit0), vld1_dup_u8(limit1));
+ *thresh_vec = vcombine_u8(vld1_dup_u8(thresh0), vld1_dup_u8(thresh1));
+}
+
+// Here flat is 64-bit long, with each 8-bit (or 4-bit) chunk being a mask of a
+// pixel. When used to control filter branches, we only detect whether it is all
+// 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status.
+// flat equals 0 if and only if flat_status equals 0.
+// flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true
+// because each mask occupies more than 1 bit.)
+static INLINE uint32_t calc_flat_status_8(uint8x8_t flat) {
+ return vget_lane_u32(
+ vreinterpret_u32_u64(vpaddl_u32(vreinterpret_u32_u8(flat))), 0);
+}
+
+// Here flat is 128-bit long, with each 8-bit chunk being a mask of a pixel.
+// When used to control filter branches, we only detect whether it is all 0s or
+// all 1s. We narrowing shift right each 16-bit chunk by 4 arithmetically, so
+// we get a 64-bit long number, with each 4-bit chunk being a mask of a pixel.
+// Then we pairwise add flat to a 32-bit long number flat_status.
+// flat equals 0 if and only if flat_status equals 0.
+// flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true
+// because each mask occupies more than 1 bit.)
+static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) {
+ const uint8x8_t flat_4bit =
+ vreinterpret_u8_s8(vshrn_n_s16(vreinterpretq_s16_u8(flat), 4));
+ return calc_flat_status_8(flat_4bit);
+}
+
+#define FUN_FILTER_HEV_MASK4(w, r) \
+ static INLINE uint8x##w##_t filter_hev_mask4_##w( \
+ const uint8x##w##_t limit, const uint8x##w##_t blimit, \
+ const uint8x##w##_t thresh, const uint8x##w##_t p3, \
+ const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
+ const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
+ const uint8x##w##_t q3, uint8x##w##_t *hev, uint8x##w##_t *mask) { \
+ uint8x##w##_t max, t0, t1; \
+ \
+ max = vabd##r##u8(p1, p0); \
+ max = vmax##r##u8(max, vabd##r##u8(q1, q0)); \
+ *hev = vcgt##r##u8(max, thresh); \
+ *mask = vmax##r##u8(max, vabd##r##u8(p3, p2)); \
+ *mask = vmax##r##u8(*mask, vabd##r##u8(p2, p1)); \
+ *mask = vmax##r##u8(*mask, vabd##r##u8(q2, q1)); \
+ *mask = vmax##r##u8(*mask, vabd##r##u8(q3, q2)); \
+ t0 = vabd##r##u8(p0, q0); \
+ t1 = vabd##r##u8(p1, q1); \
+ t0 = vqadd##r##u8(t0, t0); \
+ t1 = vshr##r##n_u8(t1, 1); \
+ t0 = vqadd##r##u8(t0, t1); \
+ *mask = vcle##r##u8(*mask, limit); \
+ t0 = vcle##r##u8(t0, blimit); \
+ *mask = vand##r##u8(*mask, t0); \
+ \
+ return max; \
+ }
+
+FUN_FILTER_HEV_MASK4(8, _) // filter_hev_mask4_8
+FUN_FILTER_HEV_MASK4(16, q_) // filter_hev_mask4_16
+#undef FUN_FILTER_HEV_MASK4
+
+#define FUN_FILTER_FLAT_HEV_MASK(w, r) \
+ static INLINE uint8x##w##_t filter_flat_hev_mask_##w( \
+ const uint8x##w##_t limit, const uint8x##w##_t blimit, \
+ const uint8x##w##_t thresh, const uint8x##w##_t p3, \
+ const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
+ const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
+ const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status, \
+ uint8x##w##_t *hev) { \
+ uint8x##w##_t max, mask; \
+ \
+ max = filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, \
+ q2, q3, hev, &mask); \
+ *flat = vmax##r##u8(max, vabd##r##u8(p2, p0)); \
+ *flat = vmax##r##u8(*flat, vabd##r##u8(q2, q0)); \
+ *flat = vmax##r##u8(*flat, vabd##r##u8(p3, p0)); \
+ *flat = vmax##r##u8(*flat, vabd##r##u8(q3, q0)); \
+ *flat = vcle##r##u8(*flat, vdup##r##n_u8(1)); /* flat_mask4() */ \
+ *flat = vand##r##u8(*flat, mask); \
+ *flat_status = calc_flat_status_##w(*flat); \
+ \
+ return mask; \
+ }
+
+FUN_FILTER_FLAT_HEV_MASK(8, _) // filter_flat_hev_mask_8
+FUN_FILTER_FLAT_HEV_MASK(16, q_) // filter_flat_hev_mask_16
+#undef FUN_FILTER_FLAT_HEV_MASK
+
+#define FUN_FLAT_MASK5(w, r) \
+ static INLINE uint8x##w##_t flat_mask5_##w( \
+ const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
+ const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
+ const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
+ const uint8x##w##_t q4, const uint8x##w##_t flat, \
+ uint32_t *flat2_status) { \
+ uint8x##w##_t flat2 = vabd##r##u8(p4, p0); \
+ flat2 = vmax##r##u8(flat2, vabd##r##u8(p3, p0)); \
+ flat2 = vmax##r##u8(flat2, vabd##r##u8(p2, p0)); \
+ flat2 = vmax##r##u8(flat2, vabd##r##u8(p1, p0)); \
+ flat2 = vmax##r##u8(flat2, vabd##r##u8(q1, q0)); \
+ flat2 = vmax##r##u8(flat2, vabd##r##u8(q2, q0)); \
+ flat2 = vmax##r##u8(flat2, vabd##r##u8(q3, q0)); \
+ flat2 = vmax##r##u8(flat2, vabd##r##u8(q4, q0)); \
+ flat2 = vcle##r##u8(flat2, vdup##r##n_u8(1)); \
+ flat2 = vand##r##u8(flat2, flat); \
+ *flat2_status = calc_flat_status_##w(flat2); \
+ \
+ return flat2; \
+ }
+
+FUN_FLAT_MASK5(8, _) // flat_mask5_8
+FUN_FLAT_MASK5(16, q_) // flat_mask5_16
+#undef FUN_FLAT_MASK5
+
+#define FUN_FLIP_SIGN(w, r) \
+ static INLINE int8x##w##_t flip_sign_##w(const uint8x##w##_t v) { \
+ const uint8x##w##_t sign_bit = vdup##r##n_u8(0x80); \
+ return vreinterpret##r##s8_u8(veor##r##u8(v, sign_bit)); \
+ }
+
+FUN_FLIP_SIGN(8, _) // flip_sign_8
+FUN_FLIP_SIGN(16, q_) // flip_sign_16
+#undef FUN_FLIP_SIGN
+
+#define FUN_FLIP_SIGN_BACK(w, r) \
+ static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \
+ const int8x##w##_t sign_bit = vdup##r##n_s8(0x80); \
+ return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit)); \
+ }
+
+FUN_FLIP_SIGN_BACK(8, _) // flip_sign_back_8
+FUN_FLIP_SIGN_BACK(16, q_) // flip_sign_back_16
+#undef FUN_FLIP_SIGN_BACK
+
+static INLINE void filter_update_8(const uint8x8_t sub0, const uint8x8_t sub1,
+ const uint8x8_t add0, const uint8x8_t add1,
+ uint16x8_t *sum) {
+ *sum = vsubw_u8(*sum, sub0);
+ *sum = vsubw_u8(*sum, sub1);
+ *sum = vaddw_u8(*sum, add0);
+ *sum = vaddw_u8(*sum, add1);
+}
+
+static INLINE void filter_update_16(const uint8x16_t sub0,
+ const uint8x16_t sub1,
+ const uint8x16_t add0,
+ const uint8x16_t add1, uint16x8_t *sum0,
+ uint16x8_t *sum1) {
+ *sum0 = vsubw_u8(*sum0, vget_low_u8(sub0));
+ *sum1 = vsubw_u8(*sum1, vget_high_u8(sub0));
+ *sum0 = vsubw_u8(*sum0, vget_low_u8(sub1));
+ *sum1 = vsubw_u8(*sum1, vget_high_u8(sub1));
+ *sum0 = vaddw_u8(*sum0, vget_low_u8(add0));
+ *sum1 = vaddw_u8(*sum1, vget_high_u8(add0));
+ *sum0 = vaddw_u8(*sum0, vget_low_u8(add1));
+ *sum1 = vaddw_u8(*sum1, vget_high_u8(add1));
+}
+
+static INLINE uint8x8_t calc_7_tap_filter_8_kernel(const uint8x8_t sub0,
+ const uint8x8_t sub1,
+ const uint8x8_t add0,
+ const uint8x8_t add1,
+ uint16x8_t *sum) {
+ filter_update_8(sub0, sub1, add0, add1, sum);
+ return vrshrn_n_u16(*sum, 3);
+}
+
+static INLINE uint8x16_t calc_7_tap_filter_16_kernel(
+ const uint8x16_t sub0, const uint8x16_t sub1, const uint8x16_t add0,
+ const uint8x16_t add1, uint16x8_t *sum0, uint16x8_t *sum1) {
+ filter_update_16(sub0, sub1, add0, add1, sum0, sum1);
+ return vcombine_u8(vrshrn_n_u16(*sum0, 3), vrshrn_n_u16(*sum1, 3));
+}
+
+static INLINE uint8x8_t apply_15_tap_filter_8_kernel(
+ const uint8x8_t flat, const uint8x8_t sub0, const uint8x8_t sub1,
+ const uint8x8_t add0, const uint8x8_t add1, const uint8x8_t in,
+ uint16x8_t *sum) {
+ filter_update_8(sub0, sub1, add0, add1, sum);
+ return vbsl_u8(flat, vrshrn_n_u16(*sum, 4), in);
+}
+
+static INLINE uint8x16_t apply_15_tap_filter_16_kernel(
+ const uint8x16_t flat, const uint8x16_t sub0, const uint8x16_t sub1,
+ const uint8x16_t add0, const uint8x16_t add1, const uint8x16_t in,
+ uint16x8_t *sum0, uint16x8_t *sum1) {
+ uint8x16_t t;
+ filter_update_16(sub0, sub1, add0, add1, sum0, sum1);
+ t = vcombine_u8(vrshrn_n_u16(*sum0, 4), vrshrn_n_u16(*sum1, 4));
+ return vbslq_u8(flat, t, in);
+}
+
+// 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+static INLINE void calc_7_tap_filter_8(const uint8x8_t p3, const uint8x8_t p2,
+ const uint8x8_t p1, const uint8x8_t p0,
+ const uint8x8_t q0, const uint8x8_t q1,
+ const uint8x8_t q2, const uint8x8_t q3,
+ uint8x8_t *op2, uint8x8_t *op1,
+ uint8x8_t *op0, uint8x8_t *oq0,
+ uint8x8_t *oq1, uint8x8_t *oq2) {
+ uint16x8_t sum;
+ sum = vaddl_u8(p3, p3); // 2*p3
+ sum = vaddw_u8(sum, p3); // 3*p3
+ sum = vaddw_u8(sum, p2); // 3*p3+p2
+ sum = vaddw_u8(sum, p2); // 3*p3+2*p2
+ sum = vaddw_u8(sum, p1); // 3*p3+2*p2+p1
+ sum = vaddw_u8(sum, p0); // 3*p3+2*p2+p1+p0
+ sum = vaddw_u8(sum, q0); // 3*p3+2*p2+p1+p0+q0
+ *op2 = vrshrn_n_u16(sum, 3);
+ *op1 = calc_7_tap_filter_8_kernel(p3, p2, p1, q1, &sum);
+ *op0 = calc_7_tap_filter_8_kernel(p3, p1, p0, q2, &sum);
+ *oq0 = calc_7_tap_filter_8_kernel(p3, p0, q0, q3, &sum);
+ *oq1 = calc_7_tap_filter_8_kernel(p2, q0, q1, q3, &sum);
+ *oq2 = calc_7_tap_filter_8_kernel(p1, q1, q2, q3, &sum);
+}
+
+static INLINE void calc_7_tap_filter_16(
+ const uint8x16_t p3, const uint8x16_t p2, const uint8x16_t p1,
+ const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1,
+ const uint8x16_t q2, const uint8x16_t q3, uint8x16_t *op2, uint8x16_t *op1,
+ uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2) {
+ uint16x8_t sum0, sum1;
+ sum0 = vaddl_u8(vget_low_u8(p3), vget_low_u8(p3)); // 2*p3
+ sum1 = vaddl_u8(vget_high_u8(p3), vget_high_u8(p3)); // 2*p3
+ sum0 = vaddw_u8(sum0, vget_low_u8(p3)); // 3*p3
+ sum1 = vaddw_u8(sum1, vget_high_u8(p3)); // 3*p3
+ sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 3*p3+p2
+ sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 3*p3+p2
+ sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 3*p3+2*p2
+ sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 3*p3+2*p2
+ sum0 = vaddw_u8(sum0, vget_low_u8(p1)); // 3*p3+2*p2+p1
+ sum1 = vaddw_u8(sum1, vget_high_u8(p1)); // 3*p3+2*p2+p1
+ sum0 = vaddw_u8(sum0, vget_low_u8(p0)); // 3*p3+2*p2+p1+p0
+ sum1 = vaddw_u8(sum1, vget_high_u8(p0)); // 3*p3+2*p2+p1+p0
+ sum0 = vaddw_u8(sum0, vget_low_u8(q0)); // 3*p3+2*p2+p1+p0+q0
+ sum1 = vaddw_u8(sum1, vget_high_u8(q0)); // 3*p3+2*p2+p1+p0+q0
+ *op2 = vcombine_u8(vrshrn_n_u16(sum0, 3), vrshrn_n_u16(sum1, 3));
+ *op1 = calc_7_tap_filter_16_kernel(p3, p2, p1, q1, &sum0, &sum1);
+ *op0 = calc_7_tap_filter_16_kernel(p3, p1, p0, q2, &sum0, &sum1);
+ *oq0 = calc_7_tap_filter_16_kernel(p3, p0, q0, q3, &sum0, &sum1);
+ *oq1 = calc_7_tap_filter_16_kernel(p2, q0, q1, q3, &sum0, &sum1);
+ *oq2 = calc_7_tap_filter_16_kernel(p1, q1, q2, q3, &sum0, &sum1);
+}
+
+#define FUN_APPLY_7_TAP_FILTER(w, r) \
+ static INLINE void apply_7_tap_filter_##w( \
+ const uint8x##w##_t flat, const uint8x##w##_t p3, \
+ const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
+ const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
+ const uint8x##w##_t q3, uint8x##w##_t *op2, uint8x##w##_t *op1, \
+ uint8x##w##_t *op0, uint8x##w##_t *oq0, uint8x##w##_t *oq1, \
+ uint8x##w##_t *oq2) { \
+ uint8x##w##_t tp1, tp0, tq0, tq1; \
+ calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0, \
+ &tq0, &tq1, oq2); \
+ *op2 = vbsl##r##u8(flat, *op2, p2); \
+ *op1 = vbsl##r##u8(flat, tp1, *op1); \
+ *op0 = vbsl##r##u8(flat, tp0, *op0); \
+ *oq0 = vbsl##r##u8(flat, tq0, *oq0); \
+ *oq1 = vbsl##r##u8(flat, tq1, *oq1); \
+ *oq2 = vbsl##r##u8(flat, *oq2, q2); \
+ }
+
+FUN_APPLY_7_TAP_FILTER(8, _) // apply_7_tap_filter_8
+FUN_APPLY_7_TAP_FILTER(16, q_) // apply_7_tap_filter_16
+#undef FUN_APPLY_7_TAP_FILTER
+
+// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+static INLINE void apply_15_tap_filter_8(
+ const uint8x8_t flat2, const uint8x8_t p7, const uint8x8_t p6,
+ const uint8x8_t p5, const uint8x8_t p4, const uint8x8_t p3,
+ const uint8x8_t p2, const uint8x8_t p1, const uint8x8_t p0,
+ const uint8x8_t q0, const uint8x8_t q1, const uint8x8_t q2,
+ const uint8x8_t q3, const uint8x8_t q4, const uint8x8_t q5,
+ const uint8x8_t q6, const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5,
+ uint8x8_t *op4, uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1,
+ uint8x8_t *op0, uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2,
+ uint8x8_t *oq3, uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) {
+ uint16x8_t sum;
+ sum = vshll_n_u8(p7, 3); // 8*p7
+ sum = vsubw_u8(sum, p7); // 7*p7
+ sum = vaddw_u8(sum, p6); // 7*p7+p6
+ sum = vaddw_u8(sum, p6); // 7*p7+2*p6
+ sum = vaddw_u8(sum, p5); // 7*p7+2*p6+p5
+ sum = vaddw_u8(sum, p4); // 7*p7+2*p6+p5+p4
+ sum = vaddw_u8(sum, p3); // 7*p7+2*p6+p5+p4+p3
+ sum = vaddw_u8(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2
+ sum = vaddw_u8(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1
+ sum = vaddw_u8(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
+ sum = vaddw_u8(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
+ *op6 = vbsl_u8(flat2, vrshrn_n_u16(sum, 4), p6);
+ *op5 = apply_15_tap_filter_8_kernel(flat2, p7, p6, p5, q1, p5, &sum);
+ *op4 = apply_15_tap_filter_8_kernel(flat2, p7, p5, p4, q2, p4, &sum);
+ *op3 = apply_15_tap_filter_8_kernel(flat2, p7, p4, p3, q3, p3, &sum);
+ *op2 = apply_15_tap_filter_8_kernel(flat2, p7, p3, p2, q4, *op2, &sum);
+ *op1 = apply_15_tap_filter_8_kernel(flat2, p7, p2, p1, q5, *op1, &sum);
+ *op0 = apply_15_tap_filter_8_kernel(flat2, p7, p1, p0, q6, *op0, &sum);
+ *oq0 = apply_15_tap_filter_8_kernel(flat2, p7, p0, q0, q7, *oq0, &sum);
+ *oq1 = apply_15_tap_filter_8_kernel(flat2, p6, q0, q1, q7, *oq1, &sum);
+ *oq2 = apply_15_tap_filter_8_kernel(flat2, p5, q1, q2, q7, *oq2, &sum);
+ *oq3 = apply_15_tap_filter_8_kernel(flat2, p4, q2, q3, q7, q3, &sum);
+ *oq4 = apply_15_tap_filter_8_kernel(flat2, p3, q3, q4, q7, q4, &sum);
+ *oq5 = apply_15_tap_filter_8_kernel(flat2, p2, q4, q5, q7, q5, &sum);
+ *oq6 = apply_15_tap_filter_8_kernel(flat2, p1, q5, q6, q7, q6, &sum);
+}
+
+static INLINE void apply_15_tap_filter_16(
+ const uint8x16_t flat2, const uint8x16_t p7, const uint8x16_t p6,
+ const uint8x16_t p5, const uint8x16_t p4, const uint8x16_t p3,
+ const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
+ const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
+ const uint8x16_t q3, const uint8x16_t q4, const uint8x16_t q5,
+ const uint8x16_t q6, const uint8x16_t q7, uint8x16_t *op6, uint8x16_t *op5,
+ uint8x16_t *op4, uint8x16_t *op3, uint8x16_t *op2, uint8x16_t *op1,
+ uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2,
+ uint8x16_t *oq3, uint8x16_t *oq4, uint8x16_t *oq5, uint8x16_t *oq6) {
+ uint16x8_t sum0, sum1;
+ uint8x16_t t;
+ sum0 = vshll_n_u8(vget_low_u8(p7), 3); // 8*p7
+ sum1 = vshll_n_u8(vget_high_u8(p7), 3); // 8*p7
+ sum0 = vsubw_u8(sum0, vget_low_u8(p7)); // 7*p7
+ sum1 = vsubw_u8(sum1, vget_high_u8(p7)); // 7*p7
+ sum0 = vaddw_u8(sum0, vget_low_u8(p6)); // 7*p7+p6
+ sum1 = vaddw_u8(sum1, vget_high_u8(p6)); // 7*p7+p6
+ sum0 = vaddw_u8(sum0, vget_low_u8(p6)); // 7*p7+2*p6
+ sum1 = vaddw_u8(sum1, vget_high_u8(p6)); // 7*p7+2*p6
+ sum0 = vaddw_u8(sum0, vget_low_u8(p5)); // 7*p7+2*p6+p5
+ sum1 = vaddw_u8(sum1, vget_high_u8(p5)); // 7*p7+2*p6+p5
+ sum0 = vaddw_u8(sum0, vget_low_u8(p4)); // 7*p7+2*p6+p5+p4
+ sum1 = vaddw_u8(sum1, vget_high_u8(p4)); // 7*p7+2*p6+p5+p4
+ sum0 = vaddw_u8(sum0, vget_low_u8(p3)); // 7*p7+2*p6+p5+p4+p3
+ sum1 = vaddw_u8(sum1, vget_high_u8(p3)); // 7*p7+2*p6+p5+p4+p3
+ sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 7*p7+2*p6+p5+p4+p3+p2
+ sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 7*p7+2*p6+p5+p4+p3+p2
+ sum0 = vaddw_u8(sum0, vget_low_u8(p1)); // 7*p7+2*p6+p5+p4+p3+p2+p1
+ sum1 = vaddw_u8(sum1, vget_high_u8(p1)); // 7*p7+2*p6+p5+p4+p3+p2+p1
+ sum0 = vaddw_u8(sum0, vget_low_u8(p0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
+ sum1 = vaddw_u8(sum1, vget_high_u8(p0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
+ sum0 = vaddw_u8(sum0, vget_low_u8(q0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
+ sum1 = vaddw_u8(sum1, vget_high_u8(q0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
+ t = vcombine_u8(vrshrn_n_u16(sum0, 4), vrshrn_n_u16(sum1, 4));
+ *op6 = vbslq_u8(flat2, t, p6);
+ *op5 = apply_15_tap_filter_16_kernel(flat2, p7, p6, p5, q1, p5, &sum0, &sum1);
+ *op4 = apply_15_tap_filter_16_kernel(flat2, p7, p5, p4, q2, p4, &sum0, &sum1);
+ *op3 = apply_15_tap_filter_16_kernel(flat2, p7, p4, p3, q3, p3, &sum0, &sum1);
+ *op2 =
+ apply_15_tap_filter_16_kernel(flat2, p7, p3, p2, q4, *op2, &sum0, &sum1);
+ *op1 =
+ apply_15_tap_filter_16_kernel(flat2, p7, p2, p1, q5, *op1, &sum0, &sum1);
+ *op0 =
+ apply_15_tap_filter_16_kernel(flat2, p7, p1, p0, q6, *op0, &sum0, &sum1);
+ *oq0 =
+ apply_15_tap_filter_16_kernel(flat2, p7, p0, q0, q7, *oq0, &sum0, &sum1);
+ *oq1 =
+ apply_15_tap_filter_16_kernel(flat2, p6, q0, q1, q7, *oq1, &sum0, &sum1);
+ *oq2 =
+ apply_15_tap_filter_16_kernel(flat2, p5, q1, q2, q7, *oq2, &sum0, &sum1);
+ *oq3 = apply_15_tap_filter_16_kernel(flat2, p4, q2, q3, q7, q3, &sum0, &sum1);
+ *oq4 = apply_15_tap_filter_16_kernel(flat2, p3, q3, q4, q7, q4, &sum0, &sum1);
+ *oq5 = apply_15_tap_filter_16_kernel(flat2, p2, q4, q5, q7, q5, &sum0, &sum1);
+ *oq6 = apply_15_tap_filter_16_kernel(flat2, p1, q5, q6, q7, q6, &sum0, &sum1);
+}
+
+#define FUN_FILTER4(w, r) \
+ static INLINE void filter4_##w( \
+ const uint8x##w##_t mask, const uint8x##w##_t hev, \
+ const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
+ const uint8x##w##_t q1, uint8x##w##_t *op1, uint8x##w##_t *op0, \
+ uint8x##w##_t *oq0, uint8x##w##_t *oq1) { \
+ int8x##w##_t filter, filter1, filter2, t; \
+ int8x##w##_t ps1 = flip_sign_##w(p1); \
+ int8x##w##_t ps0 = flip_sign_##w(p0); \
+ int8x##w##_t qs0 = flip_sign_##w(q0); \
+ int8x##w##_t qs1 = flip_sign_##w(q1); \
+ \
+ /* add outer taps if we have high edge variance */ \
+ filter = vqsub##r##s8(ps1, qs1); \
+ filter = vand##r##s8(filter, vreinterpret##r##s8_u8(hev)); \
+ t = vqsub##r##s8(qs0, ps0); \
+ \
+ /* inner taps */ \
+ filter = vqadd##r##s8(filter, t); \
+ filter = vqadd##r##s8(filter, t); \
+ filter = vqadd##r##s8(filter, t); \
+ filter = vand##r##s8(filter, vreinterpret##r##s8_u8(mask)); \
+ \
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */ \
+ /* if it equals 4 we'll set it to adjust by -1 to account for the fact */ \
+ /* we'd round it by 3 the other way */ \
+ filter1 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(4)), 3); \
+ filter2 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(3)), 3); \
+ \
+ qs0 = vqsub##r##s8(qs0, filter1); \
+ ps0 = vqadd##r##s8(ps0, filter2); \
+ *oq0 = flip_sign_back_##w(qs0); \
+ *op0 = flip_sign_back_##w(ps0); \
+ \
+ /* outer tap adjustments */ \
+ filter = vrshr##r##n_s8(filter1, 1); \
+ filter = vbic##r##s8(filter, vreinterpret##r##s8_u8(hev)); \
+ \
+ qs1 = vqsub##r##s8(qs1, filter); \
+ ps1 = vqadd##r##s8(ps1, filter); \
+ *oq1 = flip_sign_back_##w(qs1); \
+ *op1 = flip_sign_back_##w(ps1); \
+ }
+
+FUN_FILTER4(8, _) // filter4_8
+FUN_FILTER4(16, q_) // filter4_16
+#undef FUN_FILTER4
+
+#define FUN_FILTER8(w) \
+ static INLINE void filter8_##w( \
+ const uint8x##w##_t mask, const uint8x##w##_t flat, \
+ const uint32_t flat_status, const uint8x##w##_t hev, \
+ const uint8x##w##_t p3, const uint8x##w##_t p2, const uint8x##w##_t p1, \
+ const uint8x##w##_t p0, const uint8x##w##_t q0, const uint8x##w##_t q1, \
+ const uint8x##w##_t q2, const uint8x##w##_t q3, uint8x##w##_t *op2, \
+ uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \
+ uint8x##w##_t *oq1, uint8x##w##_t *oq2) { \
+ if (flat_status != (uint32_t)-2) { \
+ filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \
+ *op2 = p2; \
+ *oq2 = q2; \
+ if (flat_status) { \
+ apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \
+ op0, oq0, oq1, oq2); \
+ } \
+ } else { \
+ calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, \
+ oq0, oq1, oq2); \
+ } \
+ }
+
+FUN_FILTER8(8) // filter8_8
+FUN_FILTER8(16) // filter8_16
+#undef FUN_FILTER8
+
+#define FUN_FILTER16(w) \
+ static INLINE void filter16_##w( \
+ const uint8x##w##_t mask, const uint8x##w##_t flat, \
+ const uint32_t flat_status, const uint8x##w##_t flat2, \
+ const uint32_t flat2_status, const uint8x##w##_t hev, \
+ const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5, \
+ const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
+ const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
+ const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
+ const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \
+ const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5, \
+ uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2, \
+ uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \
+ uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3, \
+ uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6) { \
+ if (flat_status != (uint32_t)-2) { \
+ filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \
+ } \
+ \
+ if (flat_status) { \
+ *op2 = p2; \
+ *oq2 = q2; \
+ if (flat2_status != (uint32_t)-2) { \
+ apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \
+ op0, oq0, oq1, oq2); \
+ } \
+ if (flat2_status) { \
+ apply_15_tap_filter_##w(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, \
+ q2, q3, q4, q5, q6, q7, op6, op5, op4, op3, \
+ op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, \
+ oq6); \
+ } \
+ } \
+ }
+
+FUN_FILTER16(8) // filter16_8
+FUN_FILTER16(16) // filter16_16
+#undef FUN_FILTER16
+
+#define FUN_LOAD8(w, r) \
+ static INLINE void load_##w##x8( \
+ const uint8_t *s, const int p, uint8x##w##_t *p3, uint8x##w##_t *p2, \
+ uint8x##w##_t *p1, uint8x##w##_t *p0, uint8x##w##_t *q0, \
+ uint8x##w##_t *q1, uint8x##w##_t *q2, uint8x##w##_t *q3) { \
+ *p3 = vld1##r##u8(s); \
+ s += p; \
+ *p2 = vld1##r##u8(s); \
+ s += p; \
+ *p1 = vld1##r##u8(s); \
+ s += p; \
+ *p0 = vld1##r##u8(s); \
+ s += p; \
+ *q0 = vld1##r##u8(s); \
+ s += p; \
+ *q1 = vld1##r##u8(s); \
+ s += p; \
+ *q2 = vld1##r##u8(s); \
+ s += p; \
+ *q3 = vld1##r##u8(s); \
+ }
+
+FUN_LOAD8(8, _) // load_8x8
+FUN_LOAD8(16, q_) // load_16x8
+#undef FUN_LOAD8
+
+#define FUN_LOAD16(w, r) \
+ static INLINE void load_##w##x16( \
+ const uint8_t *s, const int p, uint8x##w##_t *s0, uint8x##w##_t *s1, \
+ uint8x##w##_t *s2, uint8x##w##_t *s3, uint8x##w##_t *s4, \
+ uint8x##w##_t *s5, uint8x##w##_t *s6, uint8x##w##_t *s7, \
+ uint8x##w##_t *s8, uint8x##w##_t *s9, uint8x##w##_t *s10, \
+ uint8x##w##_t *s11, uint8x##w##_t *s12, uint8x##w##_t *s13, \
+ uint8x##w##_t *s14, uint8x##w##_t *s15) { \
+ *s0 = vld1##r##u8(s); \
+ s += p; \
+ *s1 = vld1##r##u8(s); \
+ s += p; \
+ *s2 = vld1##r##u8(s); \
+ s += p; \
+ *s3 = vld1##r##u8(s); \
+ s += p; \
+ *s4 = vld1##r##u8(s); \
+ s += p; \
+ *s5 = vld1##r##u8(s); \
+ s += p; \
+ *s6 = vld1##r##u8(s); \
+ s += p; \
+ *s7 = vld1##r##u8(s); \
+ s += p; \
+ *s8 = vld1##r##u8(s); \
+ s += p; \
+ *s9 = vld1##r##u8(s); \
+ s += p; \
+ *s10 = vld1##r##u8(s); \
+ s += p; \
+ *s11 = vld1##r##u8(s); \
+ s += p; \
+ *s12 = vld1##r##u8(s); \
+ s += p; \
+ *s13 = vld1##r##u8(s); \
+ s += p; \
+ *s14 = vld1##r##u8(s); \
+ s += p; \
+ *s15 = vld1##r##u8(s); \
+ }
+
+FUN_LOAD16(8, _) // load_8x16
+FUN_LOAD16(16, q_) // load_16x16
+#undef FUN_LOAD16
+
+#define FUN_STORE4(w, r) \
+ static INLINE void store_##w##x4( \
+ uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
+ const uint8x##w##_t s2, const uint8x##w##_t s3) { \
+ vst1##r##u8(s, s0); \
+ s += p; \
+ vst1##r##u8(s, s1); \
+ s += p; \
+ vst1##r##u8(s, s2); \
+ s += p; \
+ vst1##r##u8(s, s3); \
+ }
+
+FUN_STORE4(8, _) // store_8x4
+FUN_STORE4(16, q_) // store_16x4
+#undef FUN_STORE4
+
+#define FUN_STORE6(w, r) \
+ static INLINE void store_##w##x6( \
+ uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
+ const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \
+ const uint8x##w##_t s5) { \
+ vst1##r##u8(s, s0); \
+ s += p; \
+ vst1##r##u8(s, s1); \
+ s += p; \
+ vst1##r##u8(s, s2); \
+ s += p; \
+ vst1##r##u8(s, s3); \
+ s += p; \
+ vst1##r##u8(s, s4); \
+ s += p; \
+ vst1##r##u8(s, s5); \
+ }
+
+FUN_STORE6(8, _) // store_8x6
+FUN_STORE6(16, q_) // store_16x6
+#undef FUN_STORE6
+
+static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1,
+ const uint8x8_t p0, const uint8x8_t q0,
+ const uint8x8_t q1) {
+ uint8x8x4_t o;
+
+ o.val[0] = p1;
+ o.val[1] = p0;
+ o.val[2] = q0;
+ o.val[3] = q1;
+ vst4_lane_u8(s, o, 0);
+ s += p;
+ vst4_lane_u8(s, o, 1);
+ s += p;
+ vst4_lane_u8(s, o, 2);
+ s += p;
+ vst4_lane_u8(s, o, 3);
+ s += p;
+ vst4_lane_u8(s, o, 4);
+ s += p;
+ vst4_lane_u8(s, o, 5);
+ s += p;
+ vst4_lane_u8(s, o, 6);
+ s += p;
+ vst4_lane_u8(s, o, 7);
+}
+
+static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0,
+ const uint8x8_t s1, const uint8x8_t s2,
+ const uint8x8_t s3, const uint8x8_t s4,
+ const uint8x8_t s5) {
+ uint8x8x3_t o0, o1;
+
+ o0.val[0] = s0;
+ o0.val[1] = s1;
+ o0.val[2] = s2;
+ o1.val[0] = s3;
+ o1.val[1] = s4;
+ o1.val[2] = s5;
+ vst3_lane_u8(s - 3, o0, 0);
+ vst3_lane_u8(s + 0, o1, 0);
+ s += p;
+ vst3_lane_u8(s - 3, o0, 1);
+ vst3_lane_u8(s + 0, o1, 1);
+ s += p;
+ vst3_lane_u8(s - 3, o0, 2);
+ vst3_lane_u8(s + 0, o1, 2);
+ s += p;
+ vst3_lane_u8(s - 3, o0, 3);
+ vst3_lane_u8(s + 0, o1, 3);
+ s += p;
+ vst3_lane_u8(s - 3, o0, 4);
+ vst3_lane_u8(s + 0, o1, 4);
+ s += p;
+ vst3_lane_u8(s - 3, o0, 5);
+ vst3_lane_u8(s + 0, o1, 5);
+ s += p;
+ vst3_lane_u8(s - 3, o0, 6);
+ vst3_lane_u8(s + 0, o1, 6);
+ s += p;
+ vst3_lane_u8(s - 3, o0, 7);
+ vst3_lane_u8(s + 0, o1, 7);
+}
+
+#define FUN_STORE8(w, r) \
+ static INLINE void store_##w##x8( \
+ uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
+ const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \
+ const uint8x##w##_t s5, const uint8x##w##_t s6, \
+ const uint8x##w##_t s7) { \
+ vst1##r##u8(s, s0); \
+ s += p; \
+ vst1##r##u8(s, s1); \
+ s += p; \
+ vst1##r##u8(s, s2); \
+ s += p; \
+ vst1##r##u8(s, s3); \
+ s += p; \
+ vst1##r##u8(s, s4); \
+ s += p; \
+ vst1##r##u8(s, s5); \
+ s += p; \
+ vst1##r##u8(s, s6); \
+ s += p; \
+ vst1##r##u8(s, s7); \
+ }
+
+FUN_STORE8(8, _) // store_8x8
+FUN_STORE8(16, q_) // store_16x8
+#undef FUN_STORE8
+
+#define FUN_STORE14(w, r) \
+ static INLINE void store_##w##x14( \
+ uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \
+ const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
+ const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
+ const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
+ const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \
+ const uint32_t flat_status, const uint32_t flat2_status) { \
+ if (flat_status) { \
+ if (flat2_status) { \
+ vst1##r##u8(s - 7 * p, p6); \
+ vst1##r##u8(s - 6 * p, p5); \
+ vst1##r##u8(s - 5 * p, p4); \
+ vst1##r##u8(s - 4 * p, p3); \
+ vst1##r##u8(s + 3 * p, q3); \
+ vst1##r##u8(s + 4 * p, q4); \
+ vst1##r##u8(s + 5 * p, q5); \
+ vst1##r##u8(s + 6 * p, q6); \
+ } \
+ vst1##r##u8(s - 3 * p, p2); \
+ vst1##r##u8(s + 2 * p, q2); \
+ } \
+ vst1##r##u8(s - 2 * p, p1); \
+ vst1##r##u8(s - 1 * p, p0); \
+ vst1##r##u8(s + 0 * p, q0); \
+ vst1##r##u8(s + 1 * p, q1); \
+ }
+
+FUN_STORE14(8, _) // store_8x14
+FUN_STORE14(16, q_) // store_16x14
+#undef FUN_STORE14
+
+static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0,
+ const uint8x16_t s1, const uint8x16_t s2,
+ const uint8x16_t s3, const uint8x16_t s4,
+ const uint8x16_t s5, const uint8x16_t s6,
+ const uint8x16_t s7, const uint8x16_t s8,
+ const uint8x16_t s9, const uint8x16_t s10,
+ const uint8x16_t s11, const uint8x16_t s12,
+ const uint8x16_t s13, const uint8x16_t s14,
+ const uint8x16_t s15) {
+ vst1q_u8(s, s0);
+ s += p;
+ vst1q_u8(s, s1);
+ s += p;
+ vst1q_u8(s, s2);
+ s += p;
+ vst1q_u8(s, s3);
+ s += p;
+ vst1q_u8(s, s4);
+ s += p;
+ vst1q_u8(s, s5);
+ s += p;
+ vst1q_u8(s, s6);
+ s += p;
+ vst1q_u8(s, s7);
+ s += p;
+ vst1q_u8(s, s8);
+ s += p;
+ vst1q_u8(s, s9);
+ s += p;
+ vst1q_u8(s, s10);
+ s += p;
+ vst1q_u8(s, s11);
+ s += p;
+ vst1q_u8(s, s12);
+ s += p;
+ vst1q_u8(s, s13);
+ s += p;
+ vst1q_u8(s, s14);
+ s += p;
+ vst1q_u8(s, s15);
+}
+
+#define FUN_HOR_4_KERNEL(name, w) \
+ static INLINE void lpf_horizontal_4##name##kernel( \
+ uint8_t *s, const int p, const uint8x##w##_t blimit, \
+ const uint8x##w##_t limit, const uint8x##w##_t thresh) { \
+ uint8x##w##_t p3, p2, p1, p0, q0, q1, q2, q3, mask, hev; \
+ \
+ load_##w##x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); \
+ filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, q2, \
+ q3, &hev, &mask); \
+ filter4_##w(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); \
+ store_##w##x4(s - 2 * p, p, p1, p0, q0, q1); \
+ }
+
+FUN_HOR_4_KERNEL(_, 8) // lpf_horizontal_4_kernel
+FUN_HOR_4_KERNEL(_dual_, 16) // lpf_horizontal_4_dual_kernel
+#undef FUN_HOR_4_KERNEL
+
+void vpx_lpf_horizontal_4_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
+ lpf_horizontal_4_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
+}
+
+void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ uint8x16_t blimit_vec, limit_vec, thresh_vec;
+ load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
+ &blimit_vec, &limit_vec, &thresh_vec);
+ lpf_horizontal_4_dual_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
+}
+
+void vpx_lpf_vertical_4_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+ mask, hev;
+ load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
+ load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+ filter_hev_mask4_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
+ q2, q3, &hev, &mask);
+ filter4_8(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
+ store_4x8(s - 2, p, p1, p0, q0, q1);
+}
+
+// Dual 4-tap vertical loopfilter: an 8x16 tile is loaded and transposed so
+// the 16 pixels along the edge occupy the 16 lanes of each q-register. The 4
+// modified columns are written back as two 4x8 stores (low then high halves).
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+      mask, hev;
+  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+      s15;
+
+  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
+                     &blimit_vec, &limit_vec, &thresh_vec);
+  load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
+            &s11, &s12, &s13, &s14, &s15);
+  transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+                    s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  filter_hev_mask4_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
+                      q2, q3, &hev, &mask);
+  filter4_16(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
+  s -= 2;
+  store_4x8(s, p, vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0),
+            vget_low_u8(q1));
+  store_4x8(s + 8 * p, p, vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0),
+            vget_high_u8(q1));
+}
+
+// 8-tap loopfilter across a horizontal edge of 8 pixels: rows p3..q3 are
+// loaded, the filter/flat/hev masks computed, and the six modified rows
+// (p2..q2) written back starting 3 rows above the edge.
+void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+  uint32_t flat_status;
+
+  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
+  load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
+                                p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
+  filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+            &op1, &op0, &oq0, &oq1, &oq2);
+  store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
+}
+
+// Dual 8-tap horizontal loopfilter: both 8-pixel edges, with independent
+// thresholds, processed in 16-lane vectors; six 16-byte rows written back.
+void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+                                    const uint8_t *limit0,
+                                    const uint8_t *thresh0,
+                                    const uint8_t *blimit1,
+                                    const uint8_t *limit1,
+                                    const uint8_t *thresh1) {
+  uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+  uint32_t flat_status;
+
+  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
+                     &blimit_vec, &limit_vec, &thresh_vec);
+  load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
+                                 p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
+  filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+             &op1, &op0, &oq0, &oq1, &oq2);
+  store_16x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
+}
+
+// 8-tap loopfilter across a vertical edge: transpose an 8x8 tile to rows,
+// filter, transpose back (including the unmodified p3/q3 columns) and store
+// the whole 8x8 tile.
+void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+  uint32_t flat_status;
+
+  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
+  load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
+                                p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
+  filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+            &op1, &op0, &oq0, &oq1, &oq2);
+  // Note: transpose + store_8x8() is faster than store_6x8().
+  transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
+  store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
+}
+
+// Dual 8-tap vertical loopfilter: 8x16 load + transpose into 16-lane rows,
+// filter, then write the six modified columns as two 6x8 stores (low/high
+// halves of each vector).
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
+      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
+  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+      s15;
+  uint32_t flat_status;
+
+  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
+                     &blimit_vec, &limit_vec, &thresh_vec);
+  load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
+            &s11, &s12, &s13, &s14, &s15);
+  transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+                    s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
+                                 p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
+  filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
+             &op1, &op0, &oq0, &oq1, &oq2);
+  // Note: store_6x8() twice is faster than transpose + store_8x16().
+  store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
+            vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
+  store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1),
+            vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1),
+            vget_high_u8(oq2));
+}
+
+// Generates lpf_16_kernel (8-lane) and lpf_16_dual_kernel (16-lane): loads
+// the thresholds, derives the filter/flat/hev masks plus the wide-flat
+// (flat2) mask over p7..q7, and applies the 16-wide filter. *flat_status and
+// *flat2_status are returned to the caller, which passes them on to the
+// store helpers / uses them to choose how many pixels to write back.
+#define FUN_LPF_16_KERNEL(name, w)                                           \
+  static INLINE void lpf_16##name##kernel(                                   \
+      const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh,    \
+      const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5, \
+      const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
+      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
+      const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
+      const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \
+      const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5,        \
+      uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2,            \
+      uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0,            \
+      uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3,            \
+      uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6,            \
+      uint32_t *flat_status, uint32_t *flat2_status) {                       \
+    uint8x##w##_t blimit_vec, limit_vec, thresh_vec, mask, flat, flat2, hev; \
+                                                                             \
+    load_thresh_##w(blimit, limit, thresh, &blimit_vec, &limit_vec,          \
+                    &thresh_vec);                                            \
+    mask = filter_flat_hev_mask_##w(limit_vec, blimit_vec, thresh_vec, p3, p2, \
+                                    p1, p0, q0, q1, q2, q3, &flat,           \
+                                    flat_status, &hev);                      \
+    flat2 = flat_mask5_##w(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat,     \
+                           flat2_status);                                    \
+    filter16_##w(mask, flat, *flat_status, flat2, *flat2_status, hev, p7, p6, \
+                 p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, \
+                 op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, \
+                 oq6);                                                       \
+  }
+
+FUN_LPF_16_KERNEL(_, 8)        // lpf_16_kernel
+FUN_LPF_16_KERNEL(_dual_, 16)  // lpf_16_dual_kernel
+#undef FUN_LPF_16_KERNEL
+
+// Quiet warnings of the form: 'vpx_dsp/arm/loopfilter_neon.c|981 col 42|
+// warning: 'oq1' may be used uninitialized in this function
+// [-Wmaybe-uninitialized]', for oq1-op1. Without reworking the code or adding
+// an additional branch this warning cannot be silenced otherwise. The
+// loopfilter is only called when needed for a block so these output pixels
+// will be set.
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
+// 16-wide loopfilter across a horizontal edge of 8 pixels: loads the 16 rows
+// p7..q7, runs the wide kernel, and hands the flat/flat2 statuses to
+// store_8x14() which writes the appropriate subset of the 14 output rows.
+void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, const uint8_t *blimit,
+                                const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6,
+      op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
+  uint32_t flat_status, flat2_status;
+
+  load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2,
+            &q3, &q4, &q5, &q6, &q7);
+  lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1,
+                q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1,
+                &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status,
+                &flat2_status);
+  store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4,
+             oq5, oq6, flat_status, flat2_status);
+}
+
+// 16-pixel-wide variant of the wide horizontal loopfilter. The inner rows
+// p3..q3 come through load_16x8(); the outer rows p7..p4 and q4..q7 are
+// loaded directly since they only feed the flat2 path.
+void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh) {
+  uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7,
+      op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
+  uint32_t flat_status, flat2_status;
+
+  load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  p7 = vld1q_u8(s - 8 * p);
+  p6 = vld1q_u8(s - 7 * p);
+  p5 = vld1q_u8(s - 6 * p);
+  p4 = vld1q_u8(s - 5 * p);
+  q4 = vld1q_u8(s + 4 * p);
+  q5 = vld1q_u8(s + 5 * p);
+  q6 = vld1q_u8(s + 6 * p);
+  q7 = vld1q_u8(s + 7 * p);
+  lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0,
+                     q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2,
+                     &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
+                     &flat_status, &flat2_status);
+  store_16x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4,
+              oq5, oq6, flat_status, flat2_status);
+}
+
+// 16-wide loopfilter across a vertical edge of 8 pixels. The store path is
+// chosen from the kernel's statuses: full 16x8 when any lane took the
+// wide-flat path, 8x8 for the regular flat path, and just the 4 innermost
+// columns otherwise.
+void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
+                              const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6,
+      op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
+  uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
+  uint32_t flat_status, flat2_status;
+
+  s -= 8;
+  load_16x8(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+  transpose_u8_16x8(s0, s1, s2, s3, s4, s5, s6, s7, &p7, &p6, &p5, &p4, &p3,
+                    &p2, &p1, &p0, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+  lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1,
+                q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1,
+                &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status,
+                &flat2_status);
+  if (flat_status) {
+    if (flat2_status) {
+      transpose_u8_8x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2,
+                        oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5,
+                        &s6, &s7);
+      store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
+    } else {
+      // Note: transpose + store_8x8() is faster than store_6x8().
+      transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
+      store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
+    }
+  } else {
+    store_4x8(s + 6, p, op1, op0, oq0, oq1);
+  }
+}
+
+// 16-pixel-wide variant of the wide vertical loopfilter: a full 16x16 tile
+// is transposed, filtered, and written back through one of three store paths
+// selected by the flat/flat2 statuses (16x16, two 6x8 stores, or two 4x8
+// stores of the innermost columns).
+void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit,
+                                   const uint8_t *thresh) {
+  uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7,
+      op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
+  uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
+      s15;
+  uint32_t flat_status, flat2_status;
+
+  s -= 8;
+  load_16x16(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, &s11,
+             &s12, &s13, &s14, &s15);
+  transpose_u8_16x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
+                     s14, s15, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1,
+                     &q2, &q3, &q4, &q5, &q6, &q7);
+  lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0,
+                     q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2,
+                     &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
+                     &flat_status, &flat2_status);
+  if (flat_status) {
+    if (flat2_status) {
+      transpose_u8_16x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2,
+                         oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5,
+                         &s6, &s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14,
+                         &s15);
+      store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
+                  s13, s14, s15);
+    } else {
+      // Note: store_6x8() twice is faster than transpose + store_8x16().
+      s += 8;
+      store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
+                vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
+      store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1),
+                vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1),
+                vget_high_u8(oq2));
+    }
+  } else {
+    s += 6;
+    store_4x8(s, p, vget_low_u8(op1), vget_low_u8(op0), vget_low_u8(oq0),
+              vget_low_u8(oq1));
+    store_4x8(s + 8 * p, p, vget_high_u8(op1), vget_high_u8(op0),
+              vget_high_u8(oq0), vget_high_u8(oq1));
+  }
+}
+
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
new file mode 100644
index 0000000000..1a20da70ef
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
@@ -0,0 +1,443 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_MEM_NEON_H_
+#define VPX_VPX_DSP_ARM_MEM_NEON_H_
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Support for these xN intrinsics is lacking in older versions of GCC.
+#if defined(__GNUC__) && !defined(__clang__)
+#if __GNUC__ < 8 || defined(__arm__)
+// Fallback for GCC < 8 (and all 32-bit ARM GCC): two plain 16-byte loads.
+static INLINE uint8x16x2_t vld1q_u8_x2(uint8_t const *ptr) {
+  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
+  return res;
+}
+#endif
+
+#if __GNUC__ < 9 || defined(__arm__)
+// Fallback for GCC < 9 (and all 32-bit ARM GCC): three plain 16-byte loads.
+static INLINE uint8x16x3_t vld1q_u8_x3(uint8_t const *ptr) {
+  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+                         vld1q_u8(ptr + 2 * 16) } };
+  return res;
+}
+#endif
+#endif
+
+// Build an int16x4 from four constants, c0 in lane 0 (little-endian lane
+// order). All partial words are assembled in unsigned arithmetic: the
+// previous ((int64_t)c3 << 48) form left-shifted a negative signed value
+// when c3 < 0, which is undefined behavior in C.
+static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1,
+                                          const int16_t c2, const int16_t c3) {
+  return vcreate_s16((uint16_t)c0 | ((uint32_t)c1 << 16) |
+                     ((uint64_t)(uint16_t)c2 << 32) |
+                     ((uint64_t)(uint16_t)c3 << 48));
+}
+
+// Build an int32x2 from two constants, c0 in lane 0. The high word is
+// shifted as uint64_t: the previous ((int64_t)(uint32_t)c1 << 32) form
+// overflowed int64_t (undefined behavior) whenever the top bit of the
+// reinterpreted c1 was set.
+static INLINE int32x2_t create_s32x2_neon(const int32_t c0, const int32_t c1) {
+  return vcreate_s32((uint32_t)c0 | ((uint64_t)(uint32_t)c1 << 32));
+}
+
+// Build an int32x4 from four constants, c0 in lane 0.
+static INLINE int32x4_t create_s32x4_neon(const int32_t c0, const int32_t c1,
+                                          const int32_t c2, const int32_t c3) {
+  return vcombine_s32(create_s32x2_neon(c0, c1), create_s32x2_neon(c2, c3));
+}
+
+// Helper functions used to load tran_low_t into int16, narrowing if necessary.
+// De-interleave 16 tran_low_t values into two int16x8 vectors (even-indexed
+// elements in val[0], odd in val[1]). In the high-bitdepth build the 32-bit
+// coefficients are narrowed with vmovn -- plain truncation, so values are
+// assumed to fit in 16 bits.
+static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4x2_t v0 = vld2q_s32(buf);
+  const int32x4x2_t v1 = vld2q_s32(buf + 8);
+  const int16x4_t s0 = vmovn_s32(v0.val[0]);
+  const int16x4_t s1 = vmovn_s32(v0.val[1]);
+  const int16x4_t s2 = vmovn_s32(v1.val[0]);
+  const int16x4_t s3 = vmovn_s32(v1.val[1]);
+  int16x8x2_t res;
+  res.val[0] = vcombine_s16(s0, s2);
+  res.val[1] = vcombine_s16(s1, s3);
+  return res;
+#else
+  return vld2q_s16(buf);
+#endif
+}
+
+// Load 8 tran_low_t values as an int16x8, narrowing 32-bit storage with
+// vmovn (truncating) in the high-bitdepth build.
+static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4_t v0 = vld1q_s32(buf);
+  const int32x4_t v1 = vld1q_s32(buf + 4);
+  const int16x4_t s0 = vmovn_s32(v0);
+  const int16x4_t s1 = vmovn_s32(v1);
+  return vcombine_s16(s0, s1);
+#else
+  return vld1q_s16(buf);
+#endif
+}
+
+// Load 4 tran_low_t values as an int16x4, narrowing (truncating) the 32-bit
+// storage in the high-bitdepth build.
+static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4_t v0 = vld1q_s32(buf);
+  return vmovn_s32(v0);
+#else
+  return vld1_s16(buf);
+#endif
+}
+
+// Store an int16x8 as 8 tran_low_t values, sign-extending to 32 bits when
+// tran_low_t is 32-bit (high-bitdepth build).
+static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
+  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
+  vst1q_s32(buf, v0);
+  vst1q_s32(buf + 4, v1);
+#else
+  vst1q_s16(buf, a);
+#endif
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Direct 32-bit store/load (high-bitdepth only, where tran_low_t is int32).
+static INLINE void store_s32q_to_tran_low(tran_low_t *buf, const int32x4_t a) {
+  vst1q_s32(buf, a);
+}
+
+static INLINE int32x4_t load_tran_low_to_s32q(const tran_low_t *buf) {
+  return vld1q_s32(buf);
+}
+#endif
+
+// Propagate type information to the compiler. Without this the compiler may
+// assume the required alignment of uint32_t (4 bytes) and add alignment hints
+// to the memory access.
+//
+// This is used for functions operating on uint8_t which wish to load or store 4
+// values at a time but which may not be on 4 byte boundaries.
+static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
+  // memcpy keeps the store alignment-agnostic; compilers lower it to a
+  // single unaligned word store.
+  memcpy(buf, &a, 4);
+}
+
+// Load 4 contiguous bytes when alignment is not guaranteed.
+static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
+ uint32_t a;
+ uint32x2_t a_u32;
+ memcpy(&a, buf, 4);
+ a_u32 = vdup_n_u32(0);
+ a_u32 = vset_lane_u32(a, a_u32, 0);
+ return vreinterpret_u8_u32(a_u32);
+}
+
+// Load 4 contiguous bytes and replicate across a vector when alignment is not
+// guaranteed.
+static INLINE uint8x8_t load_replicate_u8_4x1(const uint8_t *buf) {
+  uint32_t a;
+  memcpy(&a, buf, 4);
+  // vdup duplicates the 32-bit group, so the 4 bytes appear in lanes 0-3
+  // and again in lanes 4-7.
+  return vreinterpret_u8_u32(vdup_n_u32(a));
+}
+
+// Store 4 contiguous bytes from the low half of an 8x8 vector.
+static INLINE void store_u8_4x1(uint8_t *buf, uint8x8_t a) {
+  // NOTE(review): the uint32_t * cast lets the compiler assume 4-byte
+  // alignment -- callers presumably guarantee it; confirm at call sites.
+  vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 0);
+}
+
+// Store 4 contiguous bytes from the high half of an 8x8 vector.
+static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) {
+  // Lane 1 of the u32 view == bytes 4-7 of the u8 vector. Same alignment
+  // assumption as store_u8_4x1().
+  vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 1);
+}
+
+// Load 2 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
+ ptrdiff_t stride) {
+ uint32_t a;
+ uint32x2_t a_u32;
+ if (stride == 4) return vld1_u8(buf);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vdup_n_u32(a);
+ memcpy(&a, buf, 4);
+ a_u32 = vset_lane_u32(a, a_u32, 1);
+ return vreinterpret_u8_u32(a_u32);
+}
+
+// Load 8 bytes when alignment is not guaranteed.
+static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) {
+ uint64_t a;
+ uint64x1_t a_u64 = vdup_n_u64(0);
+ memcpy(&a, buf, 8);
+ a_u64 = vset_lane_u64(a, a_u64, 0);
+ return vreinterpret_u16_u64(a_u64);
+}
+
+// Load 2 sets of 8 bytes when alignment is not guaranteed.
+static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
+ ptrdiff_t stride) {
+ uint64_t a;
+ uint64x2_t a_u64;
+ if (stride == 4) return vld1q_u16(buf);
+ memcpy(&a, buf, 8);
+ buf += stride;
+ a_u64 = vdupq_n_u64(a);
+ memcpy(&a, buf, 8);
+ a_u64 = vsetq_lane_u64(a, a_u64, 1);
+ return vreinterpretq_u16_u64(a_u64);
+}
+
+// Store 2 sets of 4 bytes when alignment is not guaranteed.
+static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
+ const uint8x8_t a) {
+ const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
+ if (stride == 4) {
+ vst1_u8(buf, a);
+ return;
+ }
+ uint32_to_mem(buf, vget_lane_u32(a_u32, 0));
+ buf += stride;
+ uint32_to_mem(buf, vget_lane_u32(a_u32, 1));
+}
+
+// Load 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
+ ptrdiff_t stride) {
+ uint32_t a;
+ uint32x4_t a_u32;
+ if (stride == 4) return vld1q_u8(buf);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vdupq_n_u32(a);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 1);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 2);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 3);
+ return vreinterpretq_u8_u32(a_u32);
+}
+
+// Store 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride,
+ const uint8x16_t a) {
+ const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
+ if (stride == 4) {
+ vst1q_u8(buf, a);
+ return;
+ }
+ uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
+ buf += stride;
+ uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
+ buf += stride;
+ uint32_to_mem(buf, vgetq_lane_u32(a_u32, 2));
+ buf += stride;
+ uint32_to_mem(buf, vgetq_lane_u32(a_u32, 3));
+}
+
+// Load 2 sets of 4 bytes when alignment is guaranteed.
+static INLINE uint8x8_t load_u8(const uint8_t *buf, ptrdiff_t stride) {
+ uint32x2_t a = vdup_n_u32(0);
+
+ assert(!((intptr_t)buf % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
+
+ a = vld1_lane_u32((const uint32_t *)buf, a, 0);
+ buf += stride;
+ a = vld1_lane_u32((const uint32_t *)buf, a, 1);
+ return vreinterpret_u8_u32(a);
+}
+
+// Store 2 sets of 4 bytes when alignment is guaranteed.
+static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) {
+ uint32x2_t a_u32 = vreinterpret_u32_u8(a);
+
+ assert(!((intptr_t)buf % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
+
+ vst1_lane_u32((uint32_t *)buf, a_u32, 0);
+ buf += stride;
+ vst1_lane_u32((uint32_t *)buf, a_u32, 1);
+}
+
+// Load four 8-byte rows separated by pitch p.
+static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3) {
+  *s0 = vld1_u8(s + 0 * p);
+  *s1 = vld1_u8(s + 1 * p);
+  *s2 = vld1_u8(s + 2 * p);
+  *s3 = vld1_u8(s + 3 * p);
+}
+
+// Store four 8-byte rows separated by pitch p.
+static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p,
+                                const uint8x8_t s0, const uint8x8_t s1,
+                                const uint8x8_t s2, const uint8x8_t s3) {
+  vst1_u8(s + 0 * p, s0);
+  vst1_u8(s + 1 * p, s1);
+  vst1_u8(s + 2 * p, s2);
+  vst1_u8(s + 3 * p, s3);
+}
+
+// Load four 16-byte rows separated by pitch p.
+static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2, uint8x16_t *const s3) {
+  *s0 = vld1q_u8(s + 0 * p);
+  *s1 = vld1q_u8(s + 1 * p);
+  *s2 = vld1q_u8(s + 2 * p);
+  *s3 = vld1q_u8(s + 3 * p);
+}
+
+// Store four 16-byte rows separated by pitch p.
+static INLINE void store_u8_16x4(uint8_t *s, const ptrdiff_t p,
+                                 const uint8x16_t s0, const uint8x16_t s1,
+                                 const uint8x16_t s2, const uint8x16_t s3) {
+  vst1q_u8(s + 0 * p, s0);
+  vst1q_u8(s + 1 * p, s1);
+  vst1q_u8(s + 2 * p, s2);
+  vst1q_u8(s + 3 * p, s3);
+}
+
+// Load seven 8-byte rows separated by pitch p.
+static INLINE void load_u8_8x7(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3,
+                               uint8x8_t *const s4, uint8x8_t *const s5,
+                               uint8x8_t *const s6) {
+  *s0 = vld1_u8(s + 0 * p);
+  *s1 = vld1_u8(s + 1 * p);
+  *s2 = vld1_u8(s + 2 * p);
+  *s3 = vld1_u8(s + 3 * p);
+  *s4 = vld1_u8(s + 4 * p);
+  *s5 = vld1_u8(s + 5 * p);
+  *s6 = vld1_u8(s + 6 * p);
+}
+
+// Load eight 8-byte rows separated by pitch p.
+static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3,
+                               uint8x8_t *const s4, uint8x8_t *const s5,
+                               uint8x8_t *const s6, uint8x8_t *const s7) {
+  *s0 = vld1_u8(s + 0 * p);
+  *s1 = vld1_u8(s + 1 * p);
+  *s2 = vld1_u8(s + 2 * p);
+  *s3 = vld1_u8(s + 3 * p);
+  *s4 = vld1_u8(s + 4 * p);
+  *s5 = vld1_u8(s + 5 * p);
+  *s6 = vld1_u8(s + 6 * p);
+  *s7 = vld1_u8(s + 7 * p);
+}
+
+// Store eight 8-byte rows separated by pitch p.
+static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
+                                const uint8x8_t s0, const uint8x8_t s1,
+                                const uint8x8_t s2, const uint8x8_t s3,
+                                const uint8x8_t s4, const uint8x8_t s5,
+                                const uint8x8_t s6, const uint8x8_t s7) {
+  vst1_u8(s + 0 * p, s0);
+  vst1_u8(s + 1 * p, s1);
+  vst1_u8(s + 2 * p, s2);
+  vst1_u8(s + 3 * p, s3);
+  vst1_u8(s + 4 * p, s4);
+  vst1_u8(s + 5 * p, s5);
+  vst1_u8(s + 6 * p, s6);
+  vst1_u8(s + 7 * p, s7);
+}
+
+// Load eight 16-byte rows separated by pitch p.
+static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2, uint8x16_t *const s3,
+                                uint8x16_t *const s4, uint8x16_t *const s5,
+                                uint8x16_t *const s6, uint8x16_t *const s7) {
+  *s0 = vld1q_u8(s + 0 * p);
+  *s1 = vld1q_u8(s + 1 * p);
+  *s2 = vld1q_u8(s + 2 * p);
+  *s3 = vld1q_u8(s + 3 * p);
+  *s4 = vld1q_u8(s + 4 * p);
+  *s5 = vld1q_u8(s + 5 * p);
+  *s6 = vld1q_u8(s + 6 * p);
+  *s7 = vld1q_u8(s + 7 * p);
+}
+
+// Store eight 16-byte rows separated by pitch p.
+static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p,
+                                 const uint8x16_t s0, const uint8x16_t s1,
+                                 const uint8x16_t s2, const uint8x16_t s3,
+                                 const uint8x16_t s4, const uint8x16_t s5,
+                                 const uint8x16_t s6, const uint8x16_t s7) {
+  vst1q_u8(s + 0 * p, s0);
+  vst1q_u8(s + 1 * p, s1);
+  vst1q_u8(s + 2 * p, s2);
+  vst1q_u8(s + 3 * p, s3);
+  vst1q_u8(s + 4 * p, s4);
+  vst1q_u8(s + 5 * p, s5);
+  vst1q_u8(s + 6 * p, s6);
+  vst1q_u8(s + 7 * p, s7);
+}
+
+// Load eight rows of 8 uint16_t values; p is the pitch in uint16_t units.
+static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
+                                uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
+                                uint16x8_t *s6, uint16x8_t *s7) {
+  *s0 = vld1q_u16(s + 0 * p);
+  *s1 = vld1q_u16(s + 1 * p);
+  *s2 = vld1q_u16(s + 2 * p);
+  *s3 = vld1q_u16(s + 3 * p);
+  *s4 = vld1q_u16(s + 4 * p);
+  *s5 = vld1q_u16(s + 5 * p);
+  *s6 = vld1q_u16(s + 6 * p);
+  *s7 = vld1q_u16(s + 7 * p);
+}
+
+#endif // VPX_VPX_DSP_ARM_MEM_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c
new file mode 100644
index 0000000000..5a76065549
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+// Store dqcoeff = qcoeff * dequant. The high-bitdepth build keeps the full
+// widened 16x16->32 products; the regular build stores only the low 16 bits
+// of each product.
+static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
+                                               const int16x8_t dequant,
+                                               tran_low_t *dqcoeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4_t dqcoeff_0 =
+      vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
+  const int32x4_t dqcoeff_1 =
+      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
+
+  vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+  vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+#else
+  vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, dequant));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+// Quantize one group of 8 coefficients (vpx_quantize_b flavor):
+//   tmp    = ((|coeff| + round) * quant) >> 16 + (|coeff| + round)
+//   qcoeff = sign(coeff) * ((tmp * quant_shift) >> 16), zeroed where
+//            |coeff| < zbin.
+// Writes qcoeff and dqcoeff, and returns qcoeff so the caller can track eob.
+static INLINE int16x8_t
+quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
+                const int16x8_t round, const int16x8_t quant,
+                const int16x8_t quant_shift, const int16x8_t dequant) {
+  // Load coeffs as 8 x 16-bit ints, take sign and abs values
+  const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+  const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+  const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+  // Calculate mask of elements outside the bin
+  const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+  // Get the rounded values
+  const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+  // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+  int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+  qcoeff = vaddq_s16(qcoeff, rounded);
+
+  // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
+  qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
+
+  // Restore the sign bit: (x ^ s) - s negates x where the sign mask s is -1.
+  qcoeff = veorq_s16(qcoeff, coeff_sign);
+  qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+  // Only keep the relevant coeffs
+  qcoeff = vandq_s16(qcoeff, zbin_mask);
+  store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+  calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
+
+  return qcoeff;
+}
+
+// Quantize a whole block. Lane 0 of each parameter vector holds the DC
+// value; after the first group of 8 the AC value (lane 1) is broadcast for
+// every remaining coefficient. eob is the highest iscan index (+1, encoded
+// by iscan's convention) among non-zero qcoeffs.
+// NOTE(review): the trailing do/while always runs at least once after the
+// first group, so this assumes n_coeffs > 8 -- confirm no caller passes a
+// block of exactly 8 coefficients.
+void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         const int16_t *zbin_ptr, const int16_t *round_ptr,
+                         const int16_t *quant_ptr,
+                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                         uint16_t *eob_ptr, const int16_t *scan,
+                         const int16_t *iscan) {
+  const int16x8_t neg_one = vdupq_n_s16(-1);
+  uint16x8_t eob_max;
+
+  // Only the first element of each vector is DC.
+  int16x8_t zbin = vld1q_s16(zbin_ptr);
+  int16x8_t round = vld1q_s16(round_ptr);
+  int16x8_t quant = vld1q_s16(quant_ptr);
+  int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
+  int16x8_t dequant = vld1q_s16(dequant_ptr);
+
+  // Process first 8 values which include a dc component.
+  {
+    const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+    const int16x8_t qcoeff =
+        quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant,
+                        quant_shift, dequant);
+
+    // Set non-zero elements to -1 and use that to extract values for eob.
+    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+    __builtin_prefetch(coeff_ptr + 64);
+    coeff_ptr += 8;
+    iscan += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+  }
+
+  n_coeffs -= 8;
+
+  {
+    // Broadcast the AC (lane 1) values for the remaining coefficients.
+    zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
+    round = vdupq_lane_s16(vget_low_s16(round), 1);
+    quant = vdupq_lane_s16(vget_low_s16(quant), 1);
+    quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
+    dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
+
+    do {
+      const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+      const int16x8_t qcoeff =
+          quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+                          quant, quant_shift, dequant);
+
+      // Set non-zero elements to -1 and use that to extract values for eob.
+      eob_max =
+          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+      __builtin_prefetch(coeff_ptr + 64);
+      coeff_ptr += 8;
+      iscan += 8;
+      qcoeff_ptr += 8;
+      dqcoeff_ptr += 8;
+      n_coeffs -= 8;
+    } while (n_coeffs > 0);
+  }
+
+#if VPX_ARCH_AARCH64
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
+  {
+    // Horizontal max of the 8 lanes via pairwise reductions.
+    const uint16x4_t eob_max_0 =
+        vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+    const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+    const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+    vst1_lane_u16(eob_ptr, eob_max_2, 0);
+  }
+#endif  // VPX_ARCH_AARCH64
+  // Need these here, else the compiler complains about mixing declarations and
+  // code in C90
+  (void)scan;
+}
+
+// Returns 1 in each lane where a is negative, 0 otherwise (logical shift of
+// the sign bit down to bit 0).
+static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
+  return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
+}
+
+// Store dqcoeff = (qcoeff * dequant) / 2, rounding toward zero so the result
+// matches the C reference's integer division.
+static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
+                                                     const int16x8_t dequant,
+                                                     tran_low_t *dqcoeff_ptr) {
+  int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
+  int32x4_t dqcoeff_1 =
+      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
+
+  // Add 1 if negative to round towards zero because the C uses division.
+  dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+  dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
+  dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
+  vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+  vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+#else
+  // Shift and narrow to 16 bits in one step.
+  vst1q_s16(dqcoeff_ptr,
+            vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+// 32x32 variant of quantize_b_neon: identical except the final stage keeps
+// vqdmulh's implicit >>15 (no extra >>1), i.e. one less bit of shift, and
+// dqcoeff is halved by the store helper. The caller pre-halves zbin/round.
+static INLINE int16x8_t
+quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                      tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
+                      const int16x8_t round, const int16x8_t quant,
+                      const int16x8_t quant_shift, const int16x8_t dequant) {
+  // Load coeffs as 8 x 16-bit ints, take sign and abs values
+  const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+  const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+  const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+  // Calculate mask of elements outside the bin
+  const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+  // Get the rounded values
+  const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+  // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+  int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+  qcoeff = vaddq_s16(qcoeff, rounded);
+
+  // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
+  qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
+
+  // Restore the sign bit.
+  qcoeff = veorq_s16(qcoeff, coeff_sign);
+  qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+  // Only keep the relevant coeffs
+  qcoeff = vandq_s16(qcoeff, zbin_mask);
+  store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+  calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
+
+  return qcoeff;
+}
+
+// Main difference is that zbin values are halved before comparison and dqcoeff
+// values are divided by 2. zbin is rounded but dqcoeff is not. The iteration
+// count is fixed at 32*32/8 groups; only the first group uses the DC values.
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr,
+                               const struct macroblock_plane *mb_plane,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const struct ScanOrder *scan_order) {
+  const int16x8_t neg_one = vdupq_n_s16(-1);
+  uint16x8_t eob_max;
+  int i;
+  const int16_t *iscan = scan_order->iscan;
+
+  // Only the first element of each vector is DC. zbin and round are halved
+  // (with rounding) up front -- see the function comment.
+  int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1);
+  int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1);
+  int16x8_t quant = vld1q_s16(mb_plane->quant);
+  int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift);
+  int16x8_t dequant = vld1q_s16(dequant_ptr);
+
+  // Process first 8 values which include a dc component.
+  {
+    const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+    const int16x8_t qcoeff =
+        quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+                              quant, quant_shift, dequant);
+
+    // Set non-zero elements to -1 and use that to extract values for eob.
+    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+    __builtin_prefetch(coeff_ptr + 64);
+    coeff_ptr += 8;
+    iscan += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+  }
+
+  {
+    // Broadcast the AC (lane 1) values for the remaining 127 groups.
+    zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
+    round = vdupq_lane_s16(vget_low_s16(round), 1);
+    quant = vdupq_lane_s16(vget_low_s16(quant), 1);
+    quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
+    dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
+
+    for (i = 1; i < 32 * 32 / 8; ++i) {
+      const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+      const int16x8_t qcoeff =
+          quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+                                quant, quant_shift, dequant);
+
+      // Set non-zero elements to -1 and use that to extract values for eob.
+      eob_max =
+          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+      __builtin_prefetch(coeff_ptr + 64);
+      coeff_ptr += 8;
+      iscan += 8;
+      qcoeff_ptr += 8;
+      dqcoeff_ptr += 8;
+    }
+  }
+
+#if VPX_ARCH_AARCH64
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
+  {
+    // Horizontal max of the 8 lanes via pairwise reductions.
+    const uint16x4_t eob_max_0 =
+        vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+    const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+    const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+    vst1_lane_u16(eob_ptr, eob_max_2, 0);
+  }
+#endif  // VPX_ARCH_AARCH64
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c
new file mode 100644
index 0000000000..3a548d0f9f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include <assert.h>
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+#if defined(__ARM_FEATURE_DOTPROD)
+
+// Accumulate the SAD of one 16-byte row into *sad_sum: the absolute
+// differences of src and ref are reduced into four uint32 lanes via a
+// UDOT against a vector of ones.
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+ uint32x4_t *const sad_sum) {
+ uint8x16_t abs_diff = vabdq_u8(src, ref);
+ *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
+}
+
+// Compute the SAD of one 64-wide, h-high src block against each of the four
+// ref blocks, writing the four results to res[0..3].
+static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ // Per ref block, bytes 0-15/32-47 of each row accumulate into sum_lo and
+ // bytes 16-31/48-63 into sum_hi; the two are merged after the loop.
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int i = 0;
+ do {
+ uint8x16_t s0, s1, s2, s3;
+
+ s0 = vld1q_u8(src + i * src_stride);
+ sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+ s1 = vld1q_u8(src + i * src_stride + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+ s2 = vld1q_u8(src + i * src_stride + 32);
+ sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
+ sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
+ sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
+ sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
+
+ s3 = vld1q_u8(src + i * src_stride + 48);
+ sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
+ sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
+ sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
+ sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
+
+ i++;
+ } while (i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+// Compute the SAD of one 32-wide, h-high src block against each of the four
+// ref blocks, writing the four results to res[0..3].
+static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ // Per ref block, the first 16 bytes of each row accumulate into sum_lo and
+ // the second 16 into sum_hi.
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int i = 0;
+ do {
+ uint8x16_t s0, s1;
+
+ s0 = vld1q_u8(src + i * src_stride);
+ sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+ s1 = vld1q_u8(src + i * src_stride + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+ i++;
+ } while (i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+// Compute the SAD of one 16-wide, h-high src block against each of the four
+// ref blocks, writing the four results to res[0..3].
+static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ const uint8x16_t s = vld1q_u8(src + i * src_stride);
+ sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
+ sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
+
+ i++;
+ } while (i < h);
+
+ vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
+// Accumulate the SAD of one 16-byte row into the eight uint16 lanes of
+// *sad_sum using a pairwise add-accumulate of the absolute differences.
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+ uint16x8_t *const sad_sum) {
+ uint8x16_t abs_diff = vabdq_u8(src, ref);
+ *sad_sum = vpadalq_u8(*sad_sum, abs_diff);
+}
+
+// Compute the SAD of one 64-wide, h-high src block against each of the four
+// ref blocks, writing the four results to res[0..3].
+static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ // Per ref block, bytes 0-15/32-47 of each row accumulate into sum_lo and
+ // bytes 16-31/48-63 into sum_hi. Each uint16 lane receives at most
+ // 4 * 255 per row, so for the largest block (h = 64) the maximum is
+ // 65280 and the 16-bit accumulators cannot overflow.
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ uint8x16_t s0, s1, s2, s3;
+
+ s0 = vld1q_u8(src + i * src_stride);
+ sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+ s1 = vld1q_u8(src + i * src_stride + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+ s2 = vld1q_u8(src + i * src_stride + 32);
+ sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
+ sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
+ sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
+ sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
+
+ s3 = vld1q_u8(src + i * src_stride + 48);
+ sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
+ sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
+ sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
+ sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
+
+ i++;
+ } while (i < h);
+
+ vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi));
+}
+
+// Compute the SAD of one 32-wide, h-high src block against each of the four
+// ref blocks, writing the four results to res[0..3].
+static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ // Per ref block, the first 16 bytes of each row accumulate into sum_lo and
+ // the second 16 into sum_hi; each uint16 lane gains at most 2 * 255 per
+ // row, so no overflow for the block heights used (h <= 64).
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ uint8x16_t s0, s1;
+
+ s0 = vld1q_u8(src + i * src_stride);
+ sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+ s1 = vld1q_u8(src + i * src_stride + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+ i++;
+ } while (i < h);
+
+ vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi));
+}
+
+// Compute the SAD of one 16-wide, h-high src block against each of the four
+// ref blocks, writing the four results to res[0..3].
+static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ const uint8x16_t s = vld1q_u8(src + i * src_stride);
+ sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
+ sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
+
+ i++;
+ } while (i < h);
+
+ vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
+}
+
+#endif // defined(__ARM_FEATURE_DOTPROD)
+
+// Accumulate the SAD of one 8-byte row into *sad_sum: each byte of the
+// absolute difference widens into its own uint16 lane.
+static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref,
+ uint16x8_t *const sad_sum) {
+ uint8x8_t abs_diff = vabd_u8(src, ref);
+ *sad_sum = vaddw_u8(*sad_sum, abs_diff);
+}
+
+// Compute the SAD of one 8-wide, h-high src block against each of the four
+// ref blocks, writing the four results to res[0..3].
+static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ const uint8x8_t s = vld1_u8(src + i * src_stride);
+ sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]);
+ sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]);
+ sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]);
+ sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]);
+
+ i++;
+ } while (i < h);
+
+ vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
+}
+
+// Compute the SAD of one 4-wide, h-high src block against each of the four
+// ref blocks, writing the four results to res[0..3]. Two 4-wide rows are
+// packed into each 8-byte vector, so the loop advances two rows per
+// iteration and h must be even (callers use h = 4 or 8).
+static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride);
+ uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride);
+ uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride);
+ uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride);
+ uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride);
+
+ sad8_neon(s, r0, &sum[0]);
+ sad8_neon(s, r1, &sum[1]);
+ sad8_neon(s, r2, &sum[2]);
+ sad8_neon(s, r3, &sum[3]);
+
+ i += 2;
+ } while (i < h);
+
+ vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
+}
+
+// Instantiate vpx_sad<w>x<h>x4d_neon() for every supported block size by
+// delegating to the width-specialised helper above.
+#define SAD_WXH_4D_NEON(w, h) \
+ void vpx_sad##w##x##h##x4d_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ sad##w##xhx4d_neon(src_ptr, src_stride, ref_array, ref_stride, sad_array, \
+ (h)); \
+ }
+
+SAD_WXH_4D_NEON(4, 4)
+SAD_WXH_4D_NEON(4, 8)
+
+SAD_WXH_4D_NEON(8, 4)
+SAD_WXH_4D_NEON(8, 8)
+SAD_WXH_4D_NEON(8, 16)
+
+SAD_WXH_4D_NEON(16, 8)
+SAD_WXH_4D_NEON(16, 16)
+SAD_WXH_4D_NEON(16, 32)
+
+SAD_WXH_4D_NEON(32, 16)
+SAD_WXH_4D_NEON(32, 32)
+SAD_WXH_4D_NEON(32, 64)
+
+SAD_WXH_4D_NEON(64, 32)
+SAD_WXH_4D_NEON(64, 64)
+
+#undef SAD_WXH_4D_NEON
+
+// Instantiate vpx_sad_skip_<w>x<h>x4d_neon(): approximate the full SAD by
+// measuring only every other row (double stride, half height) and doubling
+// the four results.
+#define SAD_SKIP_WXH_4D_NEON(w, h) \
+ void vpx_sad_skip_##w##x##h##x4d_neon( \
+ const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], int ref_stride, \
+ uint32_t sad_array[4]) { \
+ sad##w##xhx4d_neon(src_ptr, 2 * src_stride, ref_array, 2 * ref_stride, \
+ sad_array, ((h) >> 1)); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+SAD_SKIP_WXH_4D_NEON(4, 4)
+SAD_SKIP_WXH_4D_NEON(4, 8)
+
+SAD_SKIP_WXH_4D_NEON(8, 4)
+SAD_SKIP_WXH_4D_NEON(8, 8)
+SAD_SKIP_WXH_4D_NEON(8, 16)
+
+SAD_SKIP_WXH_4D_NEON(16, 8)
+SAD_SKIP_WXH_4D_NEON(16, 16)
+SAD_SKIP_WXH_4D_NEON(16, 32)
+
+SAD_SKIP_WXH_4D_NEON(32, 16)
+SAD_SKIP_WXH_4D_NEON(32, 32)
+SAD_SKIP_WXH_4D_NEON(32, 64)
+
+SAD_SKIP_WXH_4D_NEON(64, 32)
+SAD_SKIP_WXH_4D_NEON(64, 64)
+
+#undef SAD_SKIP_WXH_4D_NEON
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c
new file mode 100644
index 0000000000..566a1f81db
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c
@@ -0,0 +1,570 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+#if defined(__ARM_FEATURE_DOTPROD)
+
+// Compute the SAD of a w-wide, h-high block; the inner loop consumes 32
+// bytes per step, so w must be a multiple of 32.
+static INLINE unsigned int sadwxh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int w, int h) {
+ // Only two accumulators are required for optimal instruction throughput of
+ // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr + j);
+ r0 = vld1q_u8(ref_ptr + j);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ s1 = vld1q_u8(src_ptr + j + 16);
+ r1 = vld1q_u8(ref_ptr + j + 16);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ j += 32;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+// Width-specialised wrappers over the generic w-wide SAD kernel.
+static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
+}
+
+static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ return sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
+}
+
+// Compute the SAD of a 16-wide, h-high block. Two rows are processed per
+// loop iteration (one per accumulator), so h must be even.
+static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+
+ s1 = vld1q_u8(src_ptr);
+ r1 = vld1q_u8(ref_ptr);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
+// Compute the SAD of a 64-wide, h-high block. Four uint16 accumulators
+// cover one 16-byte segment each; each lane gains at most 2 * 255 per row,
+// so there is no overflow for the heights used (h <= 64).
+static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32;
+
+ int i = h;
+ do {
+ uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3;
+ uint8x16_t diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ s1 = vld1q_u8(src_ptr + 16);
+ r1 = vld1q_u8(ref_ptr + 16);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ s2 = vld1q_u8(src_ptr + 32);
+ r2 = vld1q_u8(ref_ptr + 32);
+ diff2 = vabdq_u8(s2, r2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ s3 = vld1q_u8(src_ptr + 48);
+ r3 = vld1q_u8(ref_ptr + 48);
+ diff3 = vabdq_u8(s3, r3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ // Widen and merge the four 16-bit accumulators before the final
+ // horizontal reduction.
+ sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+ return horizontal_add_uint32x4(sum_u32);
+}
+
+// Compute the SAD of a 32-wide, h-high block. Each row's diffs are
+// pairwise-widened to uint16 and then folded into a uint32 accumulator, so
+// no 16-bit overflow is possible regardless of h.
+static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t diff0 = vabdq_u8(s0, r0);
+ uint16x8_t sum0 = vpaddlq_u8(diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t diff1 = vabdq_u8(s1, r1);
+ uint16x8_t sum1 = vpaddlq_u8(diff1);
+
+ sum = vpadalq_u16(sum, sum0);
+ sum = vpadalq_u16(sum, sum1);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(sum);
+}
+
+// Compute the SAD of a 16-wide, h-high block. Each uint16 lane gains at
+// most 2 * 255 per row, so the accumulator is safe for the heights used
+// (h <= 32).
+static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ uint8x16_t r = vld1q_u8(ref_ptr);
+
+ uint8x16_t diff = vabdq_u8(s, r);
+ sum = vpadalq_u8(sum, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint16x8(sum);
+}
+
+#endif // defined(__ARM_FEATURE_DOTPROD)
+
+// Compute the SAD of an 8-wide, h-high block using the fused
+// absolute-difference-and-accumulate (VABAL) instruction.
+static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t r = vld1_u8(ref_ptr);
+
+ sum = vabal_u8(sum, s, r);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint16x8(sum);
+}
+
+// Compute the SAD of a 4-wide, h-high block. Two 4-wide rows are packed
+// into each 8-byte vector, so the loop advances two rows per iteration and
+// h must be even.
+static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h / 2;
+ do {
+ uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+
+ sum = vabal_u8(sum, s, r);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint16x8(sum);
+}
+
+// Instantiate vpx_sad<w>x<h>_neon() for every supported block size by
+// delegating to the width-specialised helper above.
+#define SAD_WXH_NEON(w, h) \
+ unsigned int vpx_sad##w##x##h##_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \
+ }
+
+SAD_WXH_NEON(4, 4)
+SAD_WXH_NEON(4, 8)
+
+SAD_WXH_NEON(8, 4)
+SAD_WXH_NEON(8, 8)
+SAD_WXH_NEON(8, 16)
+
+SAD_WXH_NEON(16, 8)
+SAD_WXH_NEON(16, 16)
+SAD_WXH_NEON(16, 32)
+
+SAD_WXH_NEON(32, 16)
+SAD_WXH_NEON(32, 32)
+SAD_WXH_NEON(32, 64)
+
+SAD_WXH_NEON(64, 32)
+SAD_WXH_NEON(64, 64)
+
+#undef SAD_WXH_NEON
+
+// Instantiate vpx_sad_skip_<w>x<h>_neon(): approximate the full SAD by
+// measuring only every other row (double stride, half height) and doubling
+// the result.
+#define SAD_SKIP_WXH_NEON(w, h) \
+ unsigned int vpx_sad_skip_##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * \
+ sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \
+ }
+
+SAD_SKIP_WXH_NEON(4, 4)
+SAD_SKIP_WXH_NEON(4, 8)
+
+SAD_SKIP_WXH_NEON(8, 4)
+SAD_SKIP_WXH_NEON(8, 8)
+SAD_SKIP_WXH_NEON(8, 16)
+
+SAD_SKIP_WXH_NEON(16, 8)
+SAD_SKIP_WXH_NEON(16, 16)
+SAD_SKIP_WXH_NEON(16, 32)
+
+SAD_SKIP_WXH_NEON(32, 16)
+SAD_SKIP_WXH_NEON(32, 32)
+SAD_SKIP_WXH_NEON(32, 64)
+
+SAD_SKIP_WXH_NEON(64, 32)
+SAD_SKIP_WXH_NEON(64, 64)
+
+#undef SAD_SKIP_WXH_NEON
+
+#if defined(__ARM_FEATURE_DOTPROD)
+
+// Compute the SAD of a w-wide, h-high block against the rounding average of
+// ref and second_pred (compound prediction). The inner loop consumes 32
+// bytes per step, so w must be a multiple of 32; second_pred is a
+// contiguous w*h buffer.
+static INLINE unsigned int sadwxh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h,
+ const uint8_t *second_pred) {
+ // Only two accumulators are required for optimal instruction throughput of
+ // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr + j);
+ r0 = vld1q_u8(ref_ptr + j);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ s1 = vld1q_u8(src_ptr + j + 16);
+ r1 = vld1q_u8(ref_ptr + j + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ j += 32;
+ second_pred += 32;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+// Width-specialised wrappers over the generic averaging SAD kernel.
+static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
+ second_pred);
+}
+
+static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+ second_pred);
+}
+
+// Compute the SAD of a 16-wide, h-high block against the rounding average
+// of ref and second_pred. Two rows per iteration (one per accumulator), so
+// h must be even.
+static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+
+ s1 = vld1q_u8(src_ptr);
+ r1 = vld1q_u8(ref_ptr);
+ p1 = vld1q_u8(second_pred);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
+// Compute the SAD of a 64-wide, h-high block against the rounding average
+// of ref and second_pred. Four uint16 accumulators cover one 16-byte
+// segment each; each lane gains at most 2 * 255 per row, so there is no
+// overflow for the heights used (h <= 64).
+static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32;
+
+ int i = h;
+ do {
+ uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+ uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ s1 = vld1q_u8(src_ptr + 16);
+ r1 = vld1q_u8(ref_ptr + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ s2 = vld1q_u8(src_ptr + 32);
+ r2 = vld1q_u8(ref_ptr + 32);
+ p2 = vld1q_u8(second_pred + 32);
+ avg2 = vrhaddq_u8(r2, p2);
+ diff2 = vabdq_u8(s2, avg2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ s3 = vld1q_u8(src_ptr + 48);
+ r3 = vld1q_u8(ref_ptr + 48);
+ p3 = vld1q_u8(second_pred + 48);
+ avg3 = vrhaddq_u8(r3, p3);
+ diff3 = vabdq_u8(s3, avg3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
+ } while (--i != 0);
+
+ // Widen and merge the four 16-bit accumulators before the final
+ // horizontal reduction.
+ sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+ return horizontal_add_uint32x4(sum_u32);
+}
+
+// Compute the SAD of a 32-wide, h-high block against the rounding average
+// of ref and second_pred. Per-row diffs are pairwise-widened to uint16 and
+// folded into a uint32 accumulator, so no 16-bit overflow is possible.
+static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t avg0 = vrhaddq_u8(r0, p0);
+ uint8x16_t diff0 = vabdq_u8(s0, avg0);
+ uint16x8_t sum0 = vpaddlq_u8(diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t avg1 = vrhaddq_u8(r1, p1);
+ uint8x16_t diff1 = vabdq_u8(s1, avg1);
+ uint16x8_t sum1 = vpaddlq_u8(diff1);
+
+ sum = vpadalq_u16(sum, sum0);
+ sum = vpadalq_u16(sum, sum1);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
+ } while (--i != 0);
+
+ return horizontal_add_uint32x4(sum);
+}
+
+// Compute the SAD of a 16-wide, h-high block against the rounding average
+// of ref and second_pred. Each uint16 lane gains at most 2 * 255 per row,
+// safe for the heights used (h <= 32).
+static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ uint8x16_t r = vld1q_u8(ref_ptr);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t avg = vrhaddq_u8(r, p);
+ uint8x16_t diff = vabdq_u8(s, avg);
+ sum = vpadalq_u8(sum, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ return horizontal_add_uint16x8(sum);
+}
+
+#endif // defined(__ARM_FEATURE_DOTPROD)
+
+// Compute the SAD of an 8-wide, h-high block against the rounding average
+// of ref and second_pred, using the fused VABAL accumulate.
+static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t r = vld1_u8(ref_ptr);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t avg = vrhadd_u8(r, p);
+ sum = vabal_u8(sum, s, avg);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 8;
+ } while (--i != 0);
+
+ return horizontal_add_uint16x8(sum);
+}
+
+// Compute the SAD of a 4-wide, h-high block against the rounding average of
+// ref and second_pred. Two 4-wide rows are packed per 8-byte vector, so the
+// loop advances two rows per iteration and h must be even.
+static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h / 2;
+ do {
+ uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t avg = vrhadd_u8(r, p);
+ sum = vabal_u8(sum, s, avg);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ second_pred += 8;
+ } while (--i != 0);
+
+ return horizontal_add_uint16x8(sum);
+}
+
+// Instantiate vpx_sad<w>x<h>_avg_neon() for every supported block size by
+// delegating to the width-specialised averaging helper above.
+#define SAD_WXH_AVG_NEON(w, h) \
+ uint32_t vpx_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+ second_pred); \
+ }
+
+SAD_WXH_AVG_NEON(4, 4)
+SAD_WXH_AVG_NEON(4, 8)
+
+SAD_WXH_AVG_NEON(8, 4)
+SAD_WXH_AVG_NEON(8, 8)
+SAD_WXH_AVG_NEON(8, 16)
+
+SAD_WXH_AVG_NEON(16, 8)
+SAD_WXH_AVG_NEON(16, 16)
+SAD_WXH_AVG_NEON(16, 32)
+
+SAD_WXH_AVG_NEON(32, 16)
+SAD_WXH_AVG_NEON(32, 32)
+SAD_WXH_AVG_NEON(32, 64)
+
+SAD_WXH_AVG_NEON(64, 32)
+SAD_WXH_AVG_NEON(64, 64)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/save_reg_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/save_reg_neon.asm
new file mode 100644
index 0000000000..9811cd5a5a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/save_reg_neon.asm
@@ -0,0 +1,34 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vpx_push_neon|
+ EXPORT |vpx_pop_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Save the callee-saved Neon registers d8-d15 to the buffer at r0.
+|vpx_push_neon| PROC
+ vstm r0!, {d8-d15}
+ bx lr
+
+ ENDP
+
+; Restore the callee-saved Neon registers d8-d15 from the buffer at r0.
+|vpx_pop_neon| PROC
+ vldm r0!, {d8-d15}
+ bx lr
+
+ ENDP
+
+ END
+
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c
new file mode 100644
index 0000000000..9328c3ed89
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c
@@ -0,0 +1,490 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#include "vpx_dsp/variance.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+// Process a block exactly 4 wide and a multiple of 2 high.
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ vst1_u8(dst_ptr, blend_u8);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ i -= 2;
+ } while (i != 0);
+}
+
+// Process a block exactly 8 wide and any height.
+// 2-tap bilinear filter between each pixel and the one pixel_step bytes
+// away; taps sum to 8, so the widened accumulator is rounding-narrowed by 3.
+// The destination is stored contiguously (stride 8).
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                      int src_stride, int pixel_step,
+                                      int dst_height, int filter_offset) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint8x8_t s0 = vld1_u8(src_ptr);
+    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+    vst1_u8(dst_ptr, blend_u8);
+
+    src_ptr += src_stride;
+    dst_ptr += 8;
+  } while (--i != 0);
+}
+
+// Process a block which is a multiple of 16 wide and any height.
+// Same 2-tap bilinear filter as the narrow variants, operating on 16 pixels
+// per inner iteration; the destination is stored contiguously with stride
+// dst_width.
+static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
+                                         uint8_t *dst_ptr, int src_stride,
+                                         int pixel_step, int dst_width,
+                                         int dst_height, int filter_offset) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint16x8_t blend_l =
+          vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
+      uint16x8_t blend_h =
+          vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
+      uint8x8_t out_lo = vrshrn_n_u16(blend_l, 3);
+      uint8x8_t out_hi = vrshrn_n_u16(blend_h, 3);
+      vst1q_u8(dst_ptr + j, vcombine_u8(out_lo, out_hi));
+
+      j += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+// Bilinear filter for blocks 16 pixels wide; forwards to the generic
+// multiple-of-16 implementation.
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                       int src_stride, int pixel_step,
+                                       int dst_height, int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
+                               dst_height, filter_offset);
+}
+// Bilinear filter for blocks 32 pixels wide.
+static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                       int src_stride, int pixel_step,
+                                       int dst_height, int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
+                               dst_height, filter_offset);
+}
+// Bilinear filter for blocks 64 pixels wide.
+static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                       int src_stride, int pixel_step,
+                                       int dst_height, int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
+                               dst_height, filter_offset);
+}
+
+// Specialized path for filter offset 4 (taps (4,4)): the bilinear filter
+// degenerates to a rounding average of each pixel and the one pixel_step
+// bytes away. Destination is stored contiguously with stride dst_width.
+// NOTE(review): uses assert() but this file does not visibly include
+// <assert.h> — presumably pulled in via mem_neon.h; verify.
+static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                   int src_stride, int pixel_step,
+                                   int dst_width, int dst_height) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint8x16_t avg = vrhaddq_u8(s0, s1);
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+// Generic sub-pixel variance: filter horizontally (xoffset) into tmp0
+// (h + padding rows, since the vertical pass needs one extra source row per
+// output row batch), then vertically (yoffset) into tmp1, then take the
+// variance of tmp1 against ref. Used for the small widths (4 and 8).
+#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                          \
+  unsigned int vpx_sub_pixel_variance##w##x##h##_neon(                   \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,      \
+      const uint8_t *ref, int ref_stride, uint32_t *sse) {               \
+    uint8_t tmp0[w * (h + padding)];                                     \
+    uint8_t tmp1[w * h];                                                 \
+    var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+                                xoffset);                                \
+    var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
+    return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
+  }
+
+// As above but specialized on the common offsets for large widths:
+// offset 0 means no filtering in that direction (use src directly), and
+// offset 4 means even taps (4,4), which reduces to a rounding average
+// (var_filter_block2d_avg). Only the general case runs the full bilinear
+// filter in both directions.
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                \
+  unsigned int vpx_sub_pixel_variance##w##x##h##_neon(                     \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,        \
+      const uint8_t *ref, int ref_stride, unsigned int *sse) {             \
+    if (xoffset == 0) {                                                    \
+      if (yoffset == 0) {                                                  \
+        return vpx_variance##w##x##h##_neon(src, src_stride, ref,          \
+                                            ref_stride, sse);              \
+      } else if (yoffset == 4) {                                           \
+        uint8_t tmp[w * h];                                                \
+        var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h);    \
+        return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+      } else {                                                             \
+        uint8_t tmp[w * h];                                                \
+        var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h,   \
+                                    yoffset);                              \
+        return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+      }                                                                    \
+    } else if (xoffset == 4) {                                             \
+      uint8_t tmp0[w * (h + padding)];                                     \
+      if (yoffset == 0) {                                                  \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);            \
+        return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride,      \
+                                            sse);                          \
+      } else if (yoffset == 4) {                                           \
+        uint8_t tmp1[w * (h + padding)];                                   \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                     \
+        return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+      } else {                                                              \
+        uint8_t tmp1[w * (h + padding)];                                    \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);          \
+        return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+      }                                                                     \
+    } else {                                                                \
+      uint8_t tmp0[w * (h + padding)];                                      \
+      if (yoffset == 0) {                                                   \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);  \
+        return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+      } else if (yoffset == 4) {                                            \
+        uint8_t tmp1[w * h];                                                \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1,               \
+                                    (h + padding), xoffset);                \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                     \
+        return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+      } else {                                                              \
+        uint8_t tmp1[w * h];                                                \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1,               \
+                                    (h + padding), xoffset);                \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);          \
+        return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+      }                                                                     \
+    }                                                                       \
+  }
+
+// 4x<h> blocks are processed two rows at a time, so require an extra row of
+// padding. Widths 4 and 8 use the generic macro; widths >= 16 use the
+// version specialized on xoffset/yoffset values 0 and 4.
+SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 4.
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 4.
+// Filters exactly as var_filter_block2d_bil_w4() (two rows per iteration),
+// then rounding-averages the filtered pixels with second_pred before
+// storing. dst_height must be a multiple of 2.
+static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+                                               uint8_t *dst_ptr, int src_stride,
+                                               int pixel_step, int dst_height,
+                                               int filter_offset,
+                                               const uint8_t *second_pred) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+    uint8x8_t p = vld1_u8(second_pred);
+    uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+    vst1_u8(dst_ptr, avg);
+
+    src_ptr += 2 * src_stride;
+    dst_ptr += 2 * 4;
+    second_pred += 2 * 4;
+    i -= 2;
+  } while (i != 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 8.
+// Filters exactly as var_filter_block2d_bil_w8(), then rounding-averages
+// the filtered row with the corresponding row of second_pred before
+// storing.
+static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+                                               uint8_t *dst_ptr, int src_stride,
+                                               int pixel_step, int dst_height,
+                                               int filter_offset,
+                                               const uint8_t *second_pred) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint8x8_t s0 = vld1_u8(src_ptr);
+    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+    uint8x8_t p = vld1_u8(second_pred);
+    uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+    vst1_u8(dst_ptr, avg);
+
+    src_ptr += src_stride;
+    dst_ptr += 8;
+    second_pred += 8;
+    // Use `--i != 0` to match every other filter helper in this file
+    // (was `--i > 0`; identical behavior since dst_height >= 1).
+  } while (--i != 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for large blocks
+// (dst_width a multiple of 16): bilinear-filter 16 pixels at a time, then
+// rounding-average with second_pred before storing. The destination is
+// stored contiguously with stride dst_width.
+static void avg_pred_var_filter_block2d_bil_large(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, int filter_offset,
+    const uint8_t *second_pred) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint16x8_t blend_l =
+          vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
+      uint16x8_t blend_h =
+          vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
+      uint8x16_t blend_u8 =
+          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+
+      uint8x16_t p = vld1q_u8(second_pred);
+      uint8x16_t avg = vrhaddq_u8(blend_u8, p);
+
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+      second_pred += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 16.
+// Forwards to the generic multiple-of-16 implementation.
+static void avg_pred_var_filter_block2d_bil_w16(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred) {
+  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                        pixel_step, 16, dst_height,
+                                        filter_offset, second_pred);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 32.
+static void avg_pred_var_filter_block2d_bil_w32(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred) {
+  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                        pixel_step, 32, dst_height,
+                                        filter_offset, second_pred);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 64.
+static void avg_pred_var_filter_block2d_bil_w64(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred) {
+  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                        pixel_step, 64, dst_height,
+                                        filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with vpx_comp_avg_pred: for offset 4 the
+// bilinear taps are (4,4), so the filter reduces to a rounding average of
+// each pixel and the one pixel_step bytes away; the result is then
+// rounding-averaged with second_pred.
+static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
+                                            uint8_t *dst_ptr, int src_stride,
+                                            int pixel_step, int dst_width,
+                                            int dst_height,
+                                            const uint8_t *second_pred) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint8x16_t avg = vrhaddq_u8(s0, s1);
+
+      uint8x16_t p = vld1q_u8(second_pred);
+      avg = vrhaddq_u8(avg, p);
+
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+      second_pred += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+// Implementation of vpx_comp_avg_pred for blocks having width >= 16:
+// rounding-average each source row with second_pred into dst (stored
+// contiguously with stride dst_width).
+static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
+                     int dst_width, int dst_height,
+                     const uint8_t *second_pred) {
+  int i = dst_height;
+
+  // Only used for large block sizes (>= 16 wide), so the width is always a
+  // multiple of 16.
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s = vld1q_u8(src_ptr + j);
+      uint8x16_t p = vld1q_u8(second_pred);
+
+      uint8x16_t avg = vrhaddq_u8(s, p);
+
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+      second_pred += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+// Generic sub-pixel-average variance: filter horizontally into tmp0, then
+// vertically (fused with the second_pred average) into tmp1, then take the
+// variance against ref. Used for the small widths (4 and 8).
+#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                       \
+  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon(                \
+      const uint8_t *src, int source_stride, int xoffset, int yoffset,    \
+      const uint8_t *ref, int ref_stride, uint32_t *sse,                  \
+      const uint8_t *second_pred) {                                       \
+    uint8_t tmp0[w * (h + padding)];                                      \
+    uint8_t tmp1[w * h];                                                  \
+    var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
+                                xoffset);                                 \
+    avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,    \
+                                         second_pred);                    \
+    return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
+  }
+
+// As above but specialized on the common offsets for large widths, mirroring
+// SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON: offset 0 skips filtering in that
+// direction and offset 4 uses a rounding average. The second_pred averaging
+// is always fused into the final stage.
+#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)              \
+  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon(                   \
+      const uint8_t *src, int source_stride, int xoffset, int yoffset,       \
+      const uint8_t *ref, int ref_stride, unsigned int *sse,                 \
+      const uint8_t *second_pred) {                                          \
+    if (xoffset == 0) {                                                      \
+      uint8_t tmp[w * h];                                                    \
+      if (yoffset == 0) {                                                    \
+        avg_pred(src, tmp, source_stride, w, h, second_pred);                \
+        return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse);   \
+      } else if (yoffset == 4) {                                             \
+        avg_pred_var_filter_block2d_avg(src, tmp, source_stride,             \
+                                        source_stride, w, h, second_pred);   \
+        return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse);   \
+      } else {                                                               \
+        avg_pred_var_filter_block2d_bil_w##w(                                \
+            src, tmp, source_stride, source_stride, h, yoffset, second_pred); \
+        return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse);   \
+      }                                                                      \
+    } else if (xoffset == 4) {                                               \
+      uint8_t tmp0[w * (h + padding)];                                       \
+      if (yoffset == 0) {                                                    \
+        avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h,   \
+                                        second_pred);                        \
+        return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse);  \
+      } else if (yoffset == 4) {                                             \
+        uint8_t tmp1[w * (h + padding)];                                     \
+        var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+        avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+        return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);  \
+      } else {                                                               \
+        uint8_t tmp1[w * (h + padding)];                                     \
+        var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+        avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,   \
+                                             second_pred);                   \
+        return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);  \
+      }                                                                      \
+    } else {                                                                 \
+      uint8_t tmp0[w * (h + padding)];                                       \
+      if (yoffset == 0) {                                                    \
+        avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \
+                                             xoffset, second_pred);          \
+        return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse);  \
+      } else if (yoffset == 4) {                                             \
+        uint8_t tmp1[w * h];                                                 \
+        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,             \
+                                    (h + padding), xoffset);                 \
+        avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+        return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);  \
+      } else {                                                               \
+        uint8_t tmp1[w * h];                                                 \
+        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,             \
+                                    (h + padding), xoffset);                 \
+        avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,   \
+                                             second_pred);                   \
+        return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);  \
+      }                                                                      \
+    }                                                                        \
+  }
+
+// 4x<h> blocks are processed two rows at a time, so require an extra row of
+// padding. Widths 4 and 8 use the generic macro; widths >= 16 use the
+// version specialized on xoffset/yoffset values 0 and 4.
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c
new file mode 100644
index 0000000000..2c008e48ab
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+// Compute diff = src - pred for a rows x cols block, widening u8 to s16.
+// Dispatches on width: > 16 (32 pixels per inner iteration), 16, 8, and 4
+// (the 4-wide path packs two rows per iteration, so rows must be even).
+void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
+                             ptrdiff_t diff_stride, const uint8_t *src,
+                             ptrdiff_t src_stride, const uint8_t *pred,
+                             ptrdiff_t pred_stride) {
+  int r = rows, c;
+
+  if (cols > 16) {
+    // NOTE(review): this path steps 32 columns at a time, so it assumes
+    // cols is a multiple of 32 (32 or 64 in practice) — confirm if new
+    // block sizes are ever added.
+    do {
+      for (c = 0; c < cols; c += 32) {
+        const uint8x16_t s0 = vld1q_u8(&src[c + 0]);
+        const uint8x16_t s1 = vld1q_u8(&src[c + 16]);
+        const uint8x16_t p0 = vld1q_u8(&pred[c + 0]);
+        const uint8x16_t p1 = vld1q_u8(&pred[c + 16]);
+        // vsubl widens to u16; the u8 difference cannot exceed the s16
+        // range, so the reinterpret to signed is exact.
+        const uint16x8_t d0 = vsubl_u8(vget_low_u8(s0), vget_low_u8(p0));
+        const uint16x8_t d1 = vsubl_u8(vget_high_u8(s0), vget_high_u8(p0));
+        const uint16x8_t d2 = vsubl_u8(vget_low_u8(s1), vget_low_u8(p1));
+        const uint16x8_t d3 = vsubl_u8(vget_high_u8(s1), vget_high_u8(p1));
+        vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(d0));
+        vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(d1));
+        vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(d2));
+        vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(d3));
+      }
+      diff += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--r);
+  } else if (cols > 8) {
+    do {
+      const uint8x16_t s = vld1q_u8(&src[0]);
+      const uint8x16_t p = vld1q_u8(&pred[0]);
+      const uint16x8_t d0 = vsubl_u8(vget_low_u8(s), vget_low_u8(p));
+      const uint16x8_t d1 = vsubl_u8(vget_high_u8(s), vget_high_u8(p));
+      vst1q_s16(&diff[0], vreinterpretq_s16_u16(d0));
+      vst1q_s16(&diff[8], vreinterpretq_s16_u16(d1));
+      diff += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--r);
+  } else if (cols > 4) {
+    do {
+      const uint8x8_t s = vld1_u8(&src[0]);
+      const uint8x8_t p = vld1_u8(&pred[0]);
+      const uint16x8_t v_diff = vsubl_u8(s, p);
+      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
+      diff += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--r);
+  } else {
+    assert(cols == 4);
+    do {
+      // Two 4-pixel rows are gathered per 8-lane vector.
+      const uint8x8_t s = load_unaligned_u8(src, (int)src_stride);
+      const uint8x8_t p = load_unaligned_u8(pred, (int)pred_stride);
+      const uint16x8_t d = vsubl_u8(s, p);
+      vst1_s16(diff + 0 * diff_stride, vreinterpret_s16_u16(vget_low_u16(d)));
+      vst1_s16(diff + 1 * diff_stride, vreinterpret_s16_u16(vget_high_u16(d)));
+      diff += 2 * diff_stride;
+      pred += 2 * pred_stride;
+      src += 2 * src_stride;
+      r -= 2;
+    } while (r);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// High-bit-depth variant: the u8 pointers are CONVERT_TO_SHORTPTR'd u16
+// buffers. u16 subtraction wraps, but for valid pixel values (<= 12 bits)
+// the difference fits in s16, so the reinterpret is exact. bd is unused.
+void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr,
+                                    ptrdiff_t diff_stride,
+                                    const uint8_t *src8_ptr,
+                                    ptrdiff_t src_stride,
+                                    const uint8_t *pred8_ptr,
+                                    ptrdiff_t pred_stride, int bd) {
+  int r = rows, c;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr);
+  (void)bd;
+
+  if (cols >= 16) {
+    do {
+      for (c = 0; c < cols; c += 16) {
+        const uint16x8_t s0 = vld1q_u16(&src[c + 0]);
+        const uint16x8_t s1 = vld1q_u16(&src[c + 8]);
+        const uint16x8_t p0 = vld1q_u16(&pred[c + 0]);
+        const uint16x8_t p1 = vld1q_u16(&pred[c + 8]);
+        const uint16x8_t d0 = vsubq_u16(s0, p0);
+        const uint16x8_t d1 = vsubq_u16(s1, p1);
+        vst1q_s16(&diff_ptr[c + 0], vreinterpretq_s16_u16(d0));
+        vst1q_s16(&diff_ptr[c + 8], vreinterpretq_s16_u16(d1));
+      }
+      diff_ptr += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--r);
+  } else if (cols >= 8) {
+    do {
+      for (c = 0; c < cols; c += 8) {
+        const uint16x8_t s = vld1q_u16(&src[c]);
+        const uint16x8_t p = vld1q_u16(&pred[c]);
+        const uint16x8_t d0 = vsubq_u16(s, p);
+        vst1q_s16(&diff_ptr[c], vreinterpretq_s16_u16(d0));
+      }
+      diff_ptr += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--r);
+  } else if (cols >= 4) {
+    do {
+      for (c = 0; c < cols; c += 4) {
+        const uint16x4_t s = vld1_u16(&src[c]);
+        const uint16x4_t p = vld1_u16(&pred[c]);
+        const uint16x4_t v_diff = vsub_u16(s, p);
+        vst1_s16(&diff_ptr[c], vreinterpret_s16_u16(v_diff));
+      }
+      diff_ptr += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--r);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h
new file mode 100644
index 0000000000..48a2fc05ca
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_SUM_NEON_H_
+#define VPX_VPX_DSP_ARM_SUM_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+// Sum of the low four u8 lanes of a, widened to u16.
+// NOTE(review): the AArch64 path sums all eight lanes while the Armv7 path
+// sums only lanes 0-3, so the two agree only when the upper four lanes are
+// zero — verify at call sites.
+static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlv_u8(a);
+#else
+  const uint16x4_t b = vpaddl_u8(a);
+  const uint16x4_t c = vpadd_u16(b, b);
+  return vget_lane_u16(c, 0);
+#endif
+}
+
+// Sum of all eight u8 lanes of a, widened to u16.
+static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlv_u8(a);
+#else
+  const uint16x4_t b = vpaddl_u8(a);
+  const uint16x4_t c = vpadd_u16(b, b);
+  const uint16x4_t d = vpadd_u16(c, c);
+  return vget_lane_u16(d, 0);
+#endif
+}
+
+// Sum of all sixteen u8 lanes of a, widened to u16.
+static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_u8(a);
+#else
+  const uint16x8_t b = vpaddlq_u8(a);
+  const uint16x4_t c = vadd_u16(vget_low_u16(b), vget_high_u16(b));
+  const uint16x4_t d = vpadd_u16(c, c);
+  const uint16x4_t e = vpadd_u16(d, d);
+  return vget_lane_u16(e, 0);
+#endif
+}
+
+// Sum of the four u16 lanes of a (wrapping u16 arithmetic).
+static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddv_u16(a);
+#else
+  const uint16x4_t b = vpadd_u16(a, a);
+  const uint16x4_t c = vpadd_u16(b, b);
+  return vget_lane_u16(c, 0);
+#endif
+}
+
+// Sum of the eight s16 lanes of a, widened to s32 before accumulation.
+static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_s16(a);
+#else
+  const int32x4_t b = vpaddlq_s16(a);
+  const int64x2_t c = vpaddlq_s32(b);
+  const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
+                               vreinterpret_s32_s64(vget_high_s64(c)));
+  return vget_lane_s32(d, 0);
+#endif
+}
+
+// Sum of the eight u16 lanes of a, widened to u32 before accumulation.
+static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_u16(a);
+#else
+  const uint32x4_t b = vpaddlq_u16(a);
+  const uint64x2_t c = vpaddlq_u32(b);
+  const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+                                vreinterpret_u32_u64(vget_high_u64(c)));
+  return vget_lane_u32(d, 0);
+#endif
+}
+
+// Horizontal sums of four u16 vectors at once; lane i of the result holds
+// the sum of sum[i]. NOTE(review): the Armv7 path does one non-widening
+// u16 add before widening, so each input vector's total must stay below
+// 2 * 65536 to avoid wrap — presumed true for the SAD-style callers.
+static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) {
+#if VPX_ARCH_AARCH64
+  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
+  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
+  const uint16x8_t b0 = vpaddq_u16(a0, a1);
+  return vpaddlq_u16(b0);
+#else
+  const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
+  const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
+  const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
+  const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
+  const uint16x4_t b0 = vpadd_u16(a0, a1);
+  const uint16x4_t b1 = vpadd_u16(a2, a3);
+  return vpaddlq_u16(vcombine_u16(b0, b1));
+#endif
+}
+
+// Sum of all sixteen u16 lanes of vec_lo and vec_hi combined, widening to
+// u32 before any accumulation so large per-lane values cannot wrap.
+static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo,
+                                                    const uint16x8_t vec_hi) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
+#else
+  const uint32x4_t vec_l_lo =
+      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+  const uint32x4_t vec_l_hi =
+      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+#endif
+}
+
+// Four widened horizontal sums at once: lane i of the result is the u32 sum
+// of all lanes of sum_lo[i] and sum_hi[i], widening before accumulation.
+static INLINE uint32x4_t horizontal_long_add_4d_uint16x8(
+    const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) {
+  const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]);
+  const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]);
+  const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]);
+  const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]);
+  const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]);
+  const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]);
+  const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]);
+  const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]);
+#if VPX_ARCH_AARCH64
+  const uint32x4_t c0 = vpaddq_u32(b0, b1);
+  const uint32x4_t c1 = vpaddq_u32(b2, b3);
+  return vpaddq_u32(c0, c1);
+#else
+  const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
+  const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
+  const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
+  const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
+  const uint32x2_t d0 = vpadd_u32(c0, c1);
+  const uint32x2_t d1 = vpadd_u32(c2, c3);
+  return vcombine_u32(d0, d1);
+#endif
+}
+
+// Sum of the two s32 lanes of a.
+static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddv_s32(a);
+#else
+  return vget_lane_s32(a, 0) + vget_lane_s32(a, 1);
+#endif
+}
+
+// Sum of the two u32 lanes of a.
+static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddv_u32(a);
+#else
+  return vget_lane_u32(a, 0) + vget_lane_u32(a, 1);
+#endif
+}
+
+// Sum of the four s32 lanes of a (widening to s64 internally on Armv7, but
+// the returned value is truncated back to s32).
+static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddvq_s32(a);
+#else
+  const int64x2_t b = vpaddlq_s32(a);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+#endif
+}
+
+// Sum of the four u32 lanes of a (result truncated to u32).
+static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddvq_u32(a);
+#else
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+#endif
+}
+
+// Horizontal sums of four u32 vectors at once; lane i of the result holds
+// the sum of sum[i].
+static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) {
+#if VPX_ARCH_AARCH64
+  uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
+  uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
+  return vpaddq_u32(res01, res23);
+#else
+  uint32x4_t res = vdupq_n_u32(0);
+  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0);
+  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1);
+  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2);
+  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3);
+  return res;
+#endif
+}
+
+// Sum of the four u32 lanes of a, widened to u64 so the total cannot wrap.
+static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_u32(a);
+#else
+  const uint64x2_t b = vpaddlq_u32(a);
+  return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1);
+#endif
+}
+
+// Sum of the two s64 lanes of a.
+static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddvq_s64(a);
+#else
+  return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
+#endif
+}
+
+// Sum of the two u64 lanes of a.
+static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddvq_u64(a);
+#else
+  return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
+#endif
+}
+
+#endif // VPX_VPX_DSP_ARM_SUM_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c
new file mode 100644
index 0000000000..074afe3258
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+// Sum of squares of all elements of a size x size block of s16 values.
+// A 4x4 block is handled directly; larger blocks are processed in 8x8
+// tiles, accumulating squares in two s32 vectors per 8-row strip and
+// widening into a u64 accumulator between strips.
+// NOTE(review): the tiled path assumes size is a multiple of 8 — true for
+// the transform sizes this is called with; confirm for any new caller.
+uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size) {
+  if (size == 4) {
+    int16x4_t s[4];
+    int32x4_t sum_s32;
+
+    s[0] = vld1_s16(src + 0 * stride);
+    s[1] = vld1_s16(src + 1 * stride);
+    s[2] = vld1_s16(src + 2 * stride);
+    s[3] = vld1_s16(src + 3 * stride);
+
+    // Squares are non-negative, so the s32 accumulator can be treated as
+    // u32 for the final widening horizontal add.
+    sum_s32 = vmull_s16(s[0], s[0]);
+    sum_s32 = vmlal_s16(sum_s32, s[1], s[1]);
+    sum_s32 = vmlal_s16(sum_s32, s[2], s[2]);
+    sum_s32 = vmlal_s16(sum_s32, s[3], s[3]);
+
+    return horizontal_long_add_uint32x4(vreinterpretq_u32_s32(sum_s32));
+  } else {
+    uint64x2_t sum_u64 = vdupq_n_u64(0);
+    int rows = size;
+
+    do {
+      const int16_t *src_ptr = src;
+      int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+      int cols = size;
+
+      do {
+        int16x8_t s[8];
+
+        s[0] = vld1q_s16(src_ptr + 0 * stride);
+        s[1] = vld1q_s16(src_ptr + 1 * stride);
+        s[2] = vld1q_s16(src_ptr + 2 * stride);
+        s[3] = vld1q_s16(src_ptr + 3 * stride);
+        s[4] = vld1q_s16(src_ptr + 4 * stride);
+        s[5] = vld1q_s16(src_ptr + 5 * stride);
+        s[6] = vld1q_s16(src_ptr + 6 * stride);
+        s[7] = vld1q_s16(src_ptr + 7 * stride);
+
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[0]), vget_low_s16(s[0]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[1]), vget_low_s16(s[1]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[2]), vget_low_s16(s[2]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[3]), vget_low_s16(s[3]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[4]), vget_low_s16(s[4]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[5]), vget_low_s16(s[5]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[6]), vget_low_s16(s[6]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[7]), vget_low_s16(s[7]));
+
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[0]), vget_high_s16(s[0]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[1]), vget_high_s16(s[1]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[2]), vget_high_s16(s[2]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[3]), vget_high_s16(s[3]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[4]), vget_high_s16(s[4]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[5]), vget_high_s16(s[5]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[6]), vget_high_s16(s[6]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[7]), vget_high_s16(s[7]));
+
+        src_ptr += 8;
+        cols -= 8;
+      } while (cols);
+
+      // Fold the per-strip s32 squares (non-negative) into the u64 total.
+      sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[0]));
+      sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[1]));
+      src += 8 * stride;
+      rows -= 8;
+    } while (rows);
+
+    return horizontal_add_uint64x2(sum_u64);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
new file mode 100644
index 0000000000..74f85a6bb6
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -0,0 +1,1546 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
+#define VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+// Transpose 64 bit elements as follows:
+// a0: 00 01 02 03 04 05 06 07
+// a1: 16 17 18 19 20 21 22 23
+//
+// b0.val[0]: 00 01 02 03 16 17 18 19
+// b0.val[1]: 04 05 06 07 20 21 22 23
+static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
+ int16x8x2_t b0;
+#if VPX_ARCH_AARCH64
+ b0.val[0] = vreinterpretq_s16_s64(
+ vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+ b0.val[1] = vreinterpretq_s16_s64(
+ vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+#else
+ b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
+ vreinterpret_s16_s32(vget_low_s32(a1)));
+ b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
+ vreinterpret_s16_s32(vget_high_s32(a1)));
+#endif
+ return b0;
+}
+
+static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+ int32x4x2_t b0;
+#if VPX_ARCH_AARCH64
+ b0.val[0] = vreinterpretq_s32_s64(
+ vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+ b0.val[1] = vreinterpretq_s32_s64(
+ vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+#else
+ b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
+ b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+#endif
+ return b0;
+}
+
+static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
+ int64x2x2_t b0;
+#if VPX_ARCH_AARCH64
+ b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
+ b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
+#else
+ b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)),
+ vreinterpret_s64_s32(vget_low_s32(a1)));
+ b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)),
+ vreinterpret_s64_s32(vget_high_s32(a1)));
+#endif
+ return b0;
+}
+
+static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
+ uint8x16x2_t b0;
+#if VPX_ARCH_AARCH64
+ b0.val[0] = vreinterpretq_u8_u64(
+ vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+ b0.val[1] = vreinterpretq_u8_u64(
+ vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+#else
+ b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)),
+ vreinterpret_u8_u32(vget_low_u32(a1)));
+ b0.val[1] = vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)),
+ vreinterpret_u8_u32(vget_high_u32(a1)));
+#endif
+ return b0;
+}
+
+static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
+ uint16x8x2_t b0;
+#if VPX_ARCH_AARCH64
+ b0.val[0] = vreinterpretq_u16_u64(
+ vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+ b0.val[1] = vreinterpretq_u16_u64(
+ vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+#else
+ b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
+ vreinterpret_u16_u32(vget_low_u32(a1)));
+ b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
+ vreinterpret_u16_u32(vget_high_u32(a1)));
+#endif
+ return b0;
+}
+
+static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 10 11 12 13
+ // a1: 20 21 22 23 30 31 32 33
+ // to:
+ // b0.val[0]: 00 01 20 21 10 11 30 31
+ // b0.val[1]: 02 03 22 23 12 13 32 33
+
+ const uint16x4x2_t b0 =
+ vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 02 03 22 23
+ // c0.val[1]: 10 11 30 31 12 13 32 33
+
+ const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+ vreinterpret_u32_u16(b0.val[1]));
+
+ // Swap 8 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 02 12 22 32
+ // d0.val[1]: 01 11 21 31 03 13 23 33
+
+ const uint8x8x2_t d0 =
+ vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+}
+
+static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
+ int16x4_t *a2, int16x4_t *a3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
+ const int16x4x2_t b1 = vtrn_s16(*a2, *a3);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+ vreinterpret_s32_s16(b1.val[0]));
+ const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+ vreinterpret_s32_s16(b1.val[1]));
+
+ *a0 = vreinterpret_s16_s32(c0.val[0]);
+ *a1 = vreinterpret_s16_s32(c1.val[0]);
+ *a2 = vreinterpret_s16_s32(c0.val[1]);
+ *a3 = vreinterpret_s16_s32(c1.val[1]);
+}
+
+static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03 10 11 12 13
+ // a1: 20 21 22 23 30 31 32 33
+ // to:
+ // b0.val[0]: 00 01 20 21 10 11 30 31
+ // b0.val[1]: 02 03 22 23 12 13 32 33
+
+ const int32x4x2_t b0 =
+ vtrnq_s32(vreinterpretq_s32_s16(*a0), vreinterpretq_s32_s16(*a1));
+
+ // Swap 64 bit elements resulting in:
+ // c0: 00 01 20 21 02 03 22 23
+ // c1: 10 11 30 31 12 13 32 33
+
+ const int16x8x2_t c0 = vpx_vtrnq_s64_to_s16(b0.val[0], b0.val[1]);
+
+ // Swap 16 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 02 12 22 32
+ // d0.val[1]: 01 11 21 31 03 13 23 33
+
+ const int16x8x2_t d0 = vtrnq_s16(c0.val[0], c0.val[1]);
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+}
+
+static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03 10 11 12 13
+ // a1: 20 21 22 23 30 31 32 33
+ // to:
+ // b0.val[0]: 00 01 20 21 10 11 30 31
+ // b0.val[1]: 02 03 22 23 12 13 32 33
+
+ const uint32x4x2_t b0 =
+ vtrnq_u32(vreinterpretq_u32_u16(*a0), vreinterpretq_u32_u16(*a1));
+
+ // Swap 64 bit elements resulting in:
+ // c0: 00 01 20 21 02 03 22 23
+ // c1: 10 11 30 31 12 13 32 33
+
+ const uint16x8x2_t c0 = vpx_vtrnq_u64_to_u16(b0.val[0], b0.val[1]);
+
+ // Swap 16 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 02 12 22 32
+ // d0.val[1]: 01 11 21 31 03 13 23 33
+
+ const uint16x8x2_t d0 = vtrnq_u16(c0.val[0], c0.val[1]);
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+}
+
+static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3, const uint8x8_t a4,
+ const uint8x8_t a5, const uint8x8_t a6,
+ const uint8x8_t a7) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03 XX XX XX XX
+ // a1: 10 11 12 13 XX XX XX XX
+ // a2: 20 21 22 23 XX XX XX XX
+ // a3; 30 31 32 33 XX XX XX XX
+ // a4: 40 41 42 43 XX XX XX XX
+ // a5: 50 51 52 53 XX XX XX XX
+ // a6: 60 61 62 63 XX XX XX XX
+ // a7: 70 71 72 73 XX XX XX XX
+ // to:
+ // b0.val[0]: 00 01 02 03 40 41 42 43
+ // b1.val[0]: 10 11 12 13 50 51 52 53
+ // b2.val[0]: 20 21 22 23 60 61 62 63
+ // b3.val[0]: 30 31 32 33 70 71 72 73
+
+ const uint32x2x2_t b0 =
+ vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
+ const uint32x2x2_t b1 =
+ vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
+ const uint32x2x2_t b2 =
+ vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
+ const uint32x2x2_t b3 =
+ vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 40 41 60 61
+ // c0.val[1]: 02 03 22 23 42 43 62 63
+ // c1.val[0]: 10 11 30 31 50 51 70 71
+ // c1.val[1]: 12 13 32 33 52 53 72 73
+
+ const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
+ vreinterpret_u16_u32(b2.val[0]));
+ const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
+ vreinterpret_u16_u32(b3.val[0]));
+
+ // Swap 8 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 01 11 21 31 41 51 61 71
+ // d1.val[0]: 02 12 22 32 42 52 62 72
+ // d1.val[1]: 03 13 23 33 43 53 63 73
+
+ const uint8x8x2_t d0 =
+ vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
+ const uint8x8x2_t d1 =
+ vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+ *a2 = d1.val[0];
+ *a3 = d1.val[1];
+}
+
+static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+ int32x4_t *a2, int32x4_t *a3) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
+ const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
+ const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
+
+ *a0 = c0.val[0];
+ *a1 = c1.val[0];
+ *a2 = c0.val[1];
+ *a3 = c1.val[1];
+}
+
+static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
+ const int16x4_t a2, const int16x4_t a3,
+ const int16x4_t a4, const int16x4_t a5,
+ const int16x4_t a6, const int16x4_t a7,
+ int16x8_t *const o0, int16x8_t *const o1,
+ int16x8_t *const o2, int16x8_t *const o3) {
+ // Combine rows. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0: 00 01 02 03 40 41 42 43
+ // b1: 10 11 12 13 50 51 52 53
+ // b2: 20 21 22 23 60 61 62 63
+ // b3: 30 31 32 33 70 71 72 73
+
+ const int16x8_t b0 = vcombine_s16(a0, a4);
+ const int16x8_t b1 = vcombine_s16(a1, a5);
+ const int16x8_t b2 = vcombine_s16(a2, a6);
+ const int16x8_t b3 = vcombine_s16(a3, a7);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 02 12 40 50 42 52
+ // c0.val[1]: 01 11 03 13 41 51 43 53
+ // c1.val[0]: 20 30 22 32 60 70 62 72
+ // c1.val[1]: 21 31 23 33 61 71 63 73
+
+ const int16x8x2_t c0 = vtrnq_s16(b0, b1);
+ const int16x8x2_t c1 = vtrnq_s16(b2, b3);
+
+ // Swap 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 02 12 22 32 42 52 62 72
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 03 13 23 33 43 53 63 73
+
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+ vreinterpretq_s32_s16(c1.val[0]));
+ const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+ vreinterpretq_s32_s16(c1.val[1]));
+
+ *o0 = vreinterpretq_s16_s32(d0.val[0]);
+ *o1 = vreinterpretq_s16_s32(d1.val[0]);
+ *o2 = vreinterpretq_s16_s32(d0.val[1]);
+ *o3 = vreinterpretq_s16_s32(d1.val[1]);
+}
+
+static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1,
+ int32x4_t *const a2, int32x4_t *const a3,
+ int32x4_t *const a4, int32x4_t *const a5,
+ int32x4_t *const a6, int32x4_t *const a7) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+ // b2.val[0]: 40 50 42 52
+ // b2.val[1]: 41 51 43 53
+ // b3.val[0]: 60 70 62 72
+ // b3.val[1]: 61 71 63 73
+
+ const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
+ const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
+ const int32x4x2_t b2 = vtrnq_s32(*a4, *a5);
+ const int32x4x2_t b3 = vtrnq_s32(*a6, *a7);
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+ // c2.val[0]: 40 50 60 70
+ // c2.val[1]: 42 52 62 72
+ // c3.val[0]: 41 51 61 71
+ // c3.val[1]: 43 53 63 73
+
+ const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b1.val[0]);
+ const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b1.val[1]);
+ const int64x2x2_t c2 = vpx_vtrnq_s64(b2.val[0], b3.val[0]);
+ const int64x2x2_t c3 = vpx_vtrnq_s64(b2.val[1], b3.val[1]);
+
+ *a0 = vreinterpretq_s32_s64(c0.val[0]);
+ *a1 = vreinterpretq_s32_s64(c2.val[0]);
+ *a2 = vreinterpretq_s32_s64(c1.val[0]);
+ *a3 = vreinterpretq_s32_s64(c3.val[0]);
+ *a4 = vreinterpretq_s32_s64(c0.val[1]);
+ *a5 = vreinterpretq_s32_s64(c2.val[1]);
+ *a6 = vreinterpretq_s32_s64(c1.val[1]);
+ *a7 = vreinterpretq_s32_s64(c3.val[1]);
+}
+
+static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3) {
+ // Swap 8 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+
+ const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
+ const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+
+ const uint16x4x2_t c0 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
+ const uint16x4x2_t c1 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
+
+ *a0 = vreinterpret_u8_u16(c0.val[0]);
+ *a1 = vreinterpret_u8_u16(c1.val[0]);
+ *a2 = vreinterpret_u8_u16(c0.val[1]);
+ *a3 = vreinterpret_u8_u16(c1.val[1]);
+}
+
+static INLINE void transpose_u16_8x4(uint16x8_t *a0, uint16x8_t *a1,
+ uint16x8_t *a2, uint16x8_t *a3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+
+ const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
+ const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+
+ *a0 = vreinterpretq_u16_u32(c0.val[0]);
+ *a1 = vreinterpretq_u16_u32(c1.val[0]);
+ *a2 = vreinterpretq_u16_u32(c0.val[1]);
+ *a3 = vreinterpretq_u16_u32(c1.val[1]);
+}
+
+static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1,
+ int32x4_t *const a2, int32x4_t *const a3,
+ int32x4_t *const a4, int32x4_t *const a5,
+ int32x4_t *const a6, int32x4_t *const a7) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 04 05 06 07
+ // a2: 10 11 12 13
+ // a3: 14 15 16 17
+ // a4: 20 21 22 23
+ // a5: 24 25 26 27
+ // a6: 30 31 32 33
+ // a7: 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 04 14 06 16
+ // b1.val[1]: 05 15 07 17
+ // b2.val[0]: 20 30 22 32
+ // b2.val[1]: 21 31 23 33
+ // b3.val[0]: 24 34 26 36
+ // b3.val[1]: 25 35 27 37
+
+ const int32x4x2_t b0 = vtrnq_s32(*a0, *a2);
+ const int32x4x2_t b1 = vtrnq_s32(*a1, *a3);
+ const int32x4x2_t b2 = vtrnq_s32(*a4, *a6);
+ const int32x4x2_t b3 = vtrnq_s32(*a5, *a7);
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+ // c2.val[0]: 04 14 24 34
+ // c2.val[1]: 06 16 26 36
+ // c3.val[0]: 05 15 25 35
+ // c3.val[1]: 07 17 27 37
+
+ const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b2.val[0]);
+ const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b2.val[1]);
+ const int64x2x2_t c2 = vpx_vtrnq_s64(b1.val[0], b3.val[0]);
+ const int64x2x2_t c3 = vpx_vtrnq_s64(b1.val[1], b3.val[1]);
+
+ *a0 = vreinterpretq_s32_s64(c0.val[0]);
+ *a1 = vreinterpretq_s32_s64(c1.val[0]);
+ *a2 = vreinterpretq_s32_s64(c0.val[1]);
+ *a3 = vreinterpretq_s32_s64(c1.val[1]);
+ *a4 = vreinterpretq_s32_s64(c2.val[0]);
+ *a5 = vreinterpretq_s32_s64(c3.val[0]);
+ *a6 = vreinterpretq_s32_s64(c2.val[1]);
+ *a7 = vreinterpretq_s32_s64(c3.val[1]);
+}
+
+// Note: Using 'd' registers or 'q' registers has almost identical speed. We use
+// 'q' registers here to save some instructions.
+static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
+ uint8x8_t *a6, uint8x8_t *a7) {
+ // Swap 8 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
+ // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
+ // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
+ // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
+
+ const uint8x16x2_t b0 =
+ vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
+ const uint8x16x2_t b1 =
+ vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
+ // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
+ // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
+ // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
+
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+
+ // Unzip 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
+ *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
+ *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+ *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
+ *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+ *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
+ *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
+}
+
+// Transpose 8x8 to a new location.
+static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
+ const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
+ const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
+ const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+
+ const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+ out[0] = d0.val[0];
+ out[1] = d1.val[0];
+ out[2] = d2.val[0];
+ out[3] = d3.val[0];
+ out[4] = d0.val[1];
+ out[5] = d1.val[1];
+ out[6] = d2.val[1];
+ out[7] = d3.val[1];
+}
+
+static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
+ int16x8_t *a2, int16x8_t *a3,
+ int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6, int16x8_t *a7) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
+ const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
+ const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
+ const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+
+ const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+ *a0 = d0.val[0];
+ *a1 = d1.val[0];
+ *a2 = d2.val[0];
+ *a3 = d3.val[0];
+ *a4 = d0.val[1];
+ *a5 = d1.val[1];
+ *a6 = d2.val[1];
+ *a7 = d3.val[1];
+}
+
+static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
+ uint16x8_t *a2, uint16x8_t *a3,
+ uint16x8_t *a4, uint16x8_t *a5,
+ uint16x8_t *a6, uint16x8_t *a7) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
+ const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
+ const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
+ const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+ const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
+ vreinterpretq_u32_u16(b3.val[0]));
+ const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
+ vreinterpretq_u32_u16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+
+ const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
+ const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
+ const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
+ const uint16x8x2_t d3 = vpx_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);
+
+ *a0 = d0.val[0];
+ *a1 = d1.val[0];
+ *a2 = d2.val[0];
+ *a3 = d3.val[0];
+ *a4 = d0.val[1];
+ *a5 = d1.val[1];
+ *a6 = d2.val[1];
+ *a7 = d3.val[1];
+}
+
+static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
+ int32x4x2_t *a2, int32x4x2_t *a3,
+ int32x4x2_t *a4, int32x4x2_t *a5,
+ int32x4x2_t *a6, int32x4x2_t *a7) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0: 00 10 02 12 01 11 03 13
+ // b1: 20 30 22 32 21 31 23 33
+ // b2: 40 50 42 52 41 51 43 53
+ // b3: 60 70 62 72 61 71 63 73
+ // b4: 04 14 06 16 05 15 07 17
+ // b5: 24 34 26 36 25 35 27 37
+ // b6: 44 54 46 56 45 55 47 57
+ // b7: 64 74 66 76 65 75 67 77
+
+ const int32x4x2_t b0 = vtrnq_s32(a0->val[0], a1->val[0]);
+ const int32x4x2_t b1 = vtrnq_s32(a2->val[0], a3->val[0]);
+ const int32x4x2_t b2 = vtrnq_s32(a4->val[0], a5->val[0]);
+ const int32x4x2_t b3 = vtrnq_s32(a6->val[0], a7->val[0]);
+ const int32x4x2_t b4 = vtrnq_s32(a0->val[1], a1->val[1]);
+ const int32x4x2_t b5 = vtrnq_s32(a2->val[1], a3->val[1]);
+ const int32x4x2_t b6 = vtrnq_s32(a4->val[1], a5->val[1]);
+ const int32x4x2_t b7 = vtrnq_s32(a6->val[1], a7->val[1]);
+
+ // Swap 64 bit elements resulting in:
+ // c0: 00 10 20 30 02 12 22 32
+ // c1: 01 11 21 31 03 13 23 33
+ // c2: 40 50 60 70 42 52 62 72
+ // c3: 41 51 61 71 43 53 63 73
+ // c4: 04 14 24 34 06 16 26 36
+ // c5: 05 15 25 35 07 17 27 37
+ // c6: 44 54 64 74 46 56 66 76
+ // c7: 45 55 65 75 47 57 67 77
+ const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
+ const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
+ const int32x4x2_t c2 = vpx_vtrnq_s64_to_s32(b2.val[0], b3.val[0]);
+ const int32x4x2_t c3 = vpx_vtrnq_s64_to_s32(b2.val[1], b3.val[1]);
+ const int32x4x2_t c4 = vpx_vtrnq_s64_to_s32(b4.val[0], b5.val[0]);
+ const int32x4x2_t c5 = vpx_vtrnq_s64_to_s32(b4.val[1], b5.val[1]);
+ const int32x4x2_t c6 = vpx_vtrnq_s64_to_s32(b6.val[0], b7.val[0]);
+ const int32x4x2_t c7 = vpx_vtrnq_s64_to_s32(b6.val[1], b7.val[1]);
+
+ // Swap 128 bit elements resulting in:
+ // a0: 00 10 20 30 40 50 60 70
+ // a1: 01 11 21 31 41 51 61 71
+ // a2: 02 12 22 32 42 52 62 72
+ // a3: 03 13 23 33 43 53 63 73
+ // a4: 04 14 24 34 44 54 64 74
+ // a5: 05 15 25 35 45 55 65 75
+ // a6: 06 16 26 36 46 56 66 76
+ // a7: 07 17 27 37 47 57 67 77
+ a0->val[0] = c0.val[0];
+ a0->val[1] = c2.val[0];
+ a1->val[0] = c1.val[0];
+ a1->val[1] = c3.val[0];
+ a2->val[0] = c0.val[1];
+ a2->val[1] = c2.val[1];
+ a3->val[0] = c1.val[1];
+ a3->val[1] = c3.val[1];
+ a4->val[0] = c4.val[0];
+ a4->val[1] = c6.val[0];
+ a5->val[0] = c5.val[0];
+ a5->val[1] = c7.val[0];
+ a6->val[0] = c4.val[1];
+ a6->val[1] = c6.val[1];
+ a7->val[0] = c5.val[1];
+ a7->val[1] = c7.val[1];
+}
+
+// Helper transpose function for highbd FDCT variants
+static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/,
+ int32x4_t *right /*[8]*/,
+ int32x4_t *out_left /*[8]*/,
+ int32x4_t *out_right /*[8]*/) {
+ int32x4x2_t out[8];
+
+ out[0].val[0] = left[0];
+ out[0].val[1] = right[0];
+ out[1].val[0] = left[1];
+ out[1].val[1] = right[1];
+ out[2].val[0] = left[2];
+ out[2].val[1] = right[2];
+ out[3].val[0] = left[3];
+ out[3].val[1] = right[3];
+ out[4].val[0] = left[4];
+ out[4].val[1] = right[4];
+ out[5].val[0] = left[5];
+ out[5].val[1] = right[5];
+ out[6].val[0] = left[6];
+ out[6].val[1] = right[6];
+ out[7].val[0] = left[7];
+ out[7].val[1] = right[7];
+
+ transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+
+ out_left[0] = out[0].val[0];
+ out_left[1] = out[1].val[0];
+ out_left[2] = out[2].val[0];
+ out_left[3] = out[3].val[0];
+ out_left[4] = out[4].val[0];
+ out_left[5] = out[5].val[0];
+ out_left[6] = out[6].val[0];
+ out_left[7] = out[7].val[0];
+ out_right[0] = out[0].val[1];
+ out_right[1] = out[1].val[1];
+ out_right[2] = out[2].val[1];
+ out_right[3] = out[3].val[1];
+ out_right[4] = out[4].val[1];
+ out_right[5] = out[5].val[1];
+ out_right[6] = out[6].val[1];
+ out_right[7] = out[7].val[1];
+}
+
+// Transpose a 16x16 block of 32-bit values held as two column halves
+// (left*/right*) of 16 rows each.
+static INLINE void transpose_s32_16x16(int32x4_t *left1, int32x4_t *right1,
+                                       int32x4_t *left2, int32x4_t *right2) {
+  int i;
+
+  // Transpose the four 8x8 quadrants separately, but first exchange the two
+  // off-diagonal quadrants so each one can be transposed in place.
+  for (i = 0; i < 8; ++i) {
+    const int32x4_t tmp_left = left1[i + 8];
+    const int32x4_t tmp_right = right1[i + 8];
+    left1[i + 8] = left2[i];
+    right1[i + 8] = right2[i];
+    left2[i] = tmp_left;
+    right2[i] = tmp_right;
+  }
+
+  transpose_s32_8x8_2(left1, right1, left1, right1);
+  transpose_s32_8x8_2(left2, right2, left2, right2);
+  transpose_s32_8x8_2(left1 + 8, right1 + 8, left1 + 8, right1 + 8);
+  transpose_s32_8x8_2(left2 + 8, right2 + 8, left2 + 8, right2 + 8);
+}
+
+// Transpose eight 16-wide byte rows (i0..i7, an 8x16 matrix) into sixteen
+// 8-wide byte outputs (o0..o15) holding the 16x8 transpose.
+static INLINE void transpose_u8_16x8(
+    const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
+    const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
+    const uint8x16_t i6, const uint8x16_t i7, uint8x8_t *o0, uint8x8_t *o1,
+    uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
+    uint8x8_t *o7, uint8x8_t *o8, uint8x8_t *o9, uint8x8_t *o10, uint8x8_t *o11,
+    uint8x8_t *o12, uint8x8_t *o13, uint8x8_t *o14, uint8x8_t *o15) {
+  // Swap 8 bit elements. Goes from:
+  // i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
+  // i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F
+  // i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F
+  // i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F
+  // i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F
+  // i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F
+  // i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F
+  // i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+  // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+  // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+  // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+  // b2.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
+  // b2.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
+  // b3.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
+  // b3.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
+  const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
+  const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
+  const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
+  const uint8x16x2_t b3 = vtrnq_u8(i6, i7);
+
+  // Swap 16 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
+  // c0.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
+  // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
+  // c1.val[1]: 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
+  // c2.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
+  // c2.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
+  // c3.val[0]: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
+  // c3.val[1]: 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
+  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+                                    vreinterpretq_u16_u8(b1.val[0]));
+  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+                                    vreinterpretq_u16_u8(b1.val[1]));
+  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
+                                    vreinterpretq_u16_u8(b3.val[0]));
+  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
+                                    vreinterpretq_u16_u8(b3.val[1]));
+
+  // Swap 32 bit elements resulting in:
+  // d0.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
+  // d0.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
+  // d1.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
+  // d1.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
+  // d2.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
+  // d2.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
+  // d3.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
+  // d3.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
+  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+                                    vreinterpretq_u32_u16(c2.val[0]));
+  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+                                    vreinterpretq_u32_u16(c2.val[1]));
+  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
+                                    vreinterpretq_u32_u16(c3.val[0]));
+  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
+                                    vreinterpretq_u32_u16(c3.val[1]));
+
+  // Output:
+  // o0 : 00 10 20 30 40 50 60 70
+  // o1 : 01 11 21 31 41 51 61 71
+  // o2 : 02 12 22 32 42 52 62 72
+  // o3 : 03 13 23 33 43 53 63 73
+  // o4 : 04 14 24 34 44 54 64 74
+  // o5 : 05 15 25 35 45 55 65 75
+  // o6 : 06 16 26 36 46 56 66 76
+  // o7 : 07 17 27 37 47 57 67 77
+  // o8 : 08 18 28 38 48 58 68 78
+  // o9 : 09 19 29 39 49 59 69 79
+  // o10: 0A 1A 2A 3A 4A 5A 6A 7A
+  // o11: 0B 1B 2B 3B 4B 5B 6B 7B
+  // o12: 0C 1C 2C 3C 4C 5C 6C 7C
+  // o13: 0D 1D 2D 3D 4D 5D 6D 7D
+  // o14: 0E 1E 2E 3E 4E 5E 6E 7E
+  // o15: 0F 1F 2F 3F 4F 5F 6F 7F
+  *o0 = vget_low_u8(vreinterpretq_u8_u32(d0.val[0]));
+  *o1 = vget_low_u8(vreinterpretq_u8_u32(d2.val[0]));
+  *o2 = vget_low_u8(vreinterpretq_u8_u32(d1.val[0]));
+  *o3 = vget_low_u8(vreinterpretq_u8_u32(d3.val[0]));
+  *o4 = vget_low_u8(vreinterpretq_u8_u32(d0.val[1]));
+  *o5 = vget_low_u8(vreinterpretq_u8_u32(d2.val[1]));
+  *o6 = vget_low_u8(vreinterpretq_u8_u32(d1.val[1]));
+  *o7 = vget_low_u8(vreinterpretq_u8_u32(d3.val[1]));
+  *o8 = vget_high_u8(vreinterpretq_u8_u32(d0.val[0]));
+  *o9 = vget_high_u8(vreinterpretq_u8_u32(d2.val[0]));
+  *o10 = vget_high_u8(vreinterpretq_u8_u32(d1.val[0]));
+  *o11 = vget_high_u8(vreinterpretq_u8_u32(d3.val[0]));
+  *o12 = vget_high_u8(vreinterpretq_u8_u32(d0.val[1]));
+  *o13 = vget_high_u8(vreinterpretq_u8_u32(d2.val[1]));
+  *o14 = vget_high_u8(vreinterpretq_u8_u32(d1.val[1]));
+  *o15 = vget_high_u8(vreinterpretq_u8_u32(d3.val[1]));
+}
+
+// Transpose sixteen 8-wide byte rows (i0..i15, a 16x8 matrix) into eight
+// 16-wide byte outputs (o0..o7) holding the 8x16 transpose.
+static INLINE void transpose_u8_8x16(
+    const uint8x8_t i0, const uint8x8_t i1, const uint8x8_t i2,
+    const uint8x8_t i3, const uint8x8_t i4, const uint8x8_t i5,
+    const uint8x8_t i6, const uint8x8_t i7, const uint8x8_t i8,
+    const uint8x8_t i9, const uint8x8_t i10, const uint8x8_t i11,
+    const uint8x8_t i12, const uint8x8_t i13, const uint8x8_t i14,
+    const uint8x8_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
+    uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
+    uint8x16_t *o7) {
+  // Combine 8 bit elements. Goes from:
+  // i0 : 00 01 02 03 04 05 06 07
+  // i1 : 10 11 12 13 14 15 16 17
+  // i2 : 20 21 22 23 24 25 26 27
+  // i3 : 30 31 32 33 34 35 36 37
+  // i4 : 40 41 42 43 44 45 46 47
+  // i5 : 50 51 52 53 54 55 56 57
+  // i6 : 60 61 62 63 64 65 66 67
+  // i7 : 70 71 72 73 74 75 76 77
+  // i8 : 80 81 82 83 84 85 86 87
+  // i9 : 90 91 92 93 94 95 96 97
+  // i10: A0 A1 A2 A3 A4 A5 A6 A7
+  // i11: B0 B1 B2 B3 B4 B5 B6 B7
+  // i12: C0 C1 C2 C3 C4 C5 C6 C7
+  // i13: D0 D1 D2 D3 D4 D5 D6 D7
+  // i14: E0 E1 E2 E3 E4 E5 E6 E7
+  // i15: F0 F1 F2 F3 F4 F5 F6 F7
+  // to:
+  // a0: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87
+  // a1: 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97
+  // a2: 20 21 22 23 24 25 26 27 A0 A1 A2 A3 A4 A5 A6 A7
+  // a3: 30 31 32 33 34 35 36 37 B0 B1 B2 B3 B4 B5 B6 B7
+  // a4: 40 41 42 43 44 45 46 47 C0 C1 C2 C3 C4 C5 C6 C7
+  // a5: 50 51 52 53 54 55 56 57 D0 D1 D2 D3 D4 D5 D6 D7
+  // a6: 60 61 62 63 64 65 66 67 E0 E1 E2 E3 E4 E5 E6 E7
+  // a7: 70 71 72 73 74 75 76 77 F0 F1 F2 F3 F4 F5 F6 F7
+  const uint8x16_t a0 = vcombine_u8(i0, i8);
+  const uint8x16_t a1 = vcombine_u8(i1, i9);
+  const uint8x16_t a2 = vcombine_u8(i2, i10);
+  const uint8x16_t a3 = vcombine_u8(i3, i11);
+  const uint8x16_t a4 = vcombine_u8(i4, i12);
+  const uint8x16_t a5 = vcombine_u8(i5, i13);
+  const uint8x16_t a6 = vcombine_u8(i6, i14);
+  const uint8x16_t a7 = vcombine_u8(i7, i15);
+
+  // Swap 8 bit elements resulting in:
+  // b0.val[0]: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
+  // b0.val[1]: 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
+  // b1.val[0]: 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6
+  // b1.val[1]: 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7
+  // b2.val[0]: 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6
+  // b2.val[1]: 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7
+  // b3.val[0]: 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6
+  // b3.val[1]: 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7
+  const uint8x16x2_t b0 = vtrnq_u8(a0, a1);
+  const uint8x16x2_t b1 = vtrnq_u8(a2, a3);
+  const uint8x16x2_t b2 = vtrnq_u8(a4, a5);
+  const uint8x16x2_t b3 = vtrnq_u8(a6, a7);
+
+  // Swap 16 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4
+  // c0.val[1]: 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6
+  // c1.val[0]: 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5
+  // c1.val[1]: 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7
+  // c2.val[0]: 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4
+  // c2.val[1]: 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6
+  // c3.val[0]: 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5
+  // c3.val[1]: 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7
+  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+                                    vreinterpretq_u16_u8(b1.val[0]));
+  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+                                    vreinterpretq_u16_u8(b1.val[1]));
+  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
+                                    vreinterpretq_u16_u8(b3.val[0]));
+  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
+                                    vreinterpretq_u16_u8(b3.val[1]));
+
+  // Swap 32 bit elements resulting in:
+  // d0.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
+  // d0.val[1]: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
+  // d1.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
+  // d1.val[1]: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
+  // d2.val[0]: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
+  // d2.val[1]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
+  // d3.val[0]: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
+  // d3.val[1]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
+  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+                                    vreinterpretq_u32_u16(c2.val[0]));
+  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+                                    vreinterpretq_u32_u16(c2.val[1]));
+  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
+                                    vreinterpretq_u32_u16(c3.val[0]));
+  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
+                                    vreinterpretq_u32_u16(c3.val[1]));
+
+  // Output:
+  // o0: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
+  // o1: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
+  // o2: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
+  // o3: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
+  // o4: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
+  // o5: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
+  // o6: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
+  // o7: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
+  *o0 = vreinterpretq_u8_u32(d0.val[0]);
+  *o1 = vreinterpretq_u8_u32(d2.val[0]);
+  *o2 = vreinterpretq_u8_u32(d1.val[0]);
+  *o3 = vreinterpretq_u8_u32(d3.val[0]);
+  *o4 = vreinterpretq_u8_u32(d0.val[1]);
+  *o5 = vreinterpretq_u8_u32(d2.val[1]);
+  *o6 = vreinterpretq_u8_u32(d1.val[1]);
+  *o7 = vreinterpretq_u8_u32(d3.val[1]);
+}
+
+// Transpose a 16x16 block of bytes: input rows i0..i15 become output
+// columns o0..o15.
+// NOTE(review): the lane-trace comments for e1/e3/e4/e6 (and the matching
+// o1/o3/o4/o6 lines) previously listed the wrong second-half values; they
+// are corrected below from the documented d* lanes and vtrn-64 semantics.
+static INLINE void transpose_u8_16x16(
+    const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
+    const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
+    const uint8x16_t i6, const uint8x16_t i7, const uint8x16_t i8,
+    const uint8x16_t i9, const uint8x16_t i10, const uint8x16_t i11,
+    const uint8x16_t i12, const uint8x16_t i13, const uint8x16_t i14,
+    const uint8x16_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
+    uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
+    uint8x16_t *o7, uint8x16_t *o8, uint8x16_t *o9, uint8x16_t *o10,
+    uint8x16_t *o11, uint8x16_t *o12, uint8x16_t *o13, uint8x16_t *o14,
+    uint8x16_t *o15) {
+  // Swap 8 bit elements. Goes from:
+  // i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
+  // i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F
+  // i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F
+  // i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F
+  // i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F
+  // i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F
+  // i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F
+  // i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F
+  // i8: 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F
+  // i9: 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F
+  // i10: A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF
+  // i11: B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF
+  // i12: C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF
+  // i13: D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF
+  // i14: E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF
+  // i15: F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+  // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+  // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+  // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+  // b2.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
+  // b2.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
+  // b3.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
+  // b3.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
+  // b4.val[0]: 80 90 82 92 84 94 86 96 88 98 8A 9A 8C 9C 8E 9E
+  // b4.val[1]: 81 91 83 93 85 95 87 97 89 99 8B 9B 8D 9D 8F 9F
+  // b5.val[0]: A0 B0 A2 B2 A4 B4 A6 B6 A8 B8 AA BA AC BC AE BE
+  // b5.val[1]: A1 B1 A3 B3 A5 B5 A7 B7 A9 B9 AB BB AD BD AF BF
+  // b6.val[0]: C0 D0 C2 D2 C4 D4 C6 D6 C8 D8 CA DA CC DC CE DE
+  // b6.val[1]: C1 D1 C3 D3 C5 D5 C7 D7 C9 D9 CB DB CD DD CF DF
+  // b7.val[0]: E0 F0 E2 F2 E4 F4 E6 F6 E8 F8 EA FA EC FC EE FE
+  // b7.val[1]: E1 F1 E3 F3 E5 F5 E7 F7 E9 F9 EB FB ED FD EF FF
+  const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
+  const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
+  const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
+  const uint8x16x2_t b3 = vtrnq_u8(i6, i7);
+  const uint8x16x2_t b4 = vtrnq_u8(i8, i9);
+  const uint8x16x2_t b5 = vtrnq_u8(i10, i11);
+  const uint8x16x2_t b6 = vtrnq_u8(i12, i13);
+  const uint8x16x2_t b7 = vtrnq_u8(i14, i15);
+
+  // Swap 16 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
+  // c0.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
+  // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
+  // c1.val[1]: 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
+  // c2.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
+  // c2.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
+  // c3.val[0]: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
+  // c3.val[1]: 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
+  // c4.val[0]: 80 90 A0 B0 84 94 A4 B4 88 98 A8 B8 8C 9C AC BC
+  // c4.val[1]: 82 92 A2 B2 86 96 A6 B6 8A 9A AA BA 8E 9E AE BE
+  // c5.val[0]: 81 91 A1 B1 85 95 A5 B5 89 99 A9 B9 8D 9D AD BD
+  // c5.val[1]: 83 93 A3 B3 87 97 A7 B7 8B 9B AB BB 8F 9F AF BF
+  // c6.val[0]: C0 D0 E0 F0 C4 D4 E4 F4 C8 D8 E8 F8 CC DC EC FC
+  // c6.val[1]: C2 D2 E2 F2 C6 D6 E6 F6 CA DA EA FA CE DE EE FE
+  // c7.val[0]: C1 D1 E1 F1 C5 D5 E5 F5 C9 D9 E9 F9 CD DD ED FD
+  // c7.val[1]: C3 D3 E3 F3 C7 D7 E7 F7 CB DB EB FB CF DF EF FF
+  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+                                    vreinterpretq_u16_u8(b1.val[0]));
+  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+                                    vreinterpretq_u16_u8(b1.val[1]));
+  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
+                                    vreinterpretq_u16_u8(b3.val[0]));
+  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
+                                    vreinterpretq_u16_u8(b3.val[1]));
+  const uint16x8x2_t c4 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[0]),
+                                    vreinterpretq_u16_u8(b5.val[0]));
+  const uint16x8x2_t c5 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[1]),
+                                    vreinterpretq_u16_u8(b5.val[1]));
+  const uint16x8x2_t c6 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[0]),
+                                    vreinterpretq_u16_u8(b7.val[0]));
+  const uint16x8x2_t c7 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[1]),
+                                    vreinterpretq_u16_u8(b7.val[1]));
+
+  // Swap 32 bit elements resulting in:
+  // d0.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
+  // d0.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
+  // d1.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
+  // d1.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
+  // d2.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
+  // d2.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
+  // d3.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
+  // d3.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
+  // d4.val[0]: 80 90 A0 B0 C0 D0 E0 F0 88 98 A8 B8 C8 D8 E8 F8
+  // d4.val[1]: 84 94 A4 B4 C4 D4 E4 F4 8C 9C AC BC CC DC EC FC
+  // d5.val[0]: 82 92 A2 B2 C2 D2 E2 F2 8A 9A AA BA CA DA EA FA
+  // d5.val[1]: 86 96 A6 B6 C6 D6 E6 F6 8E 9E AE BE CE DE EE FE
+  // d6.val[0]: 81 91 A1 B1 C1 D1 E1 F1 89 99 A9 B9 C9 D9 E9 F9
+  // d6.val[1]: 85 95 A5 B5 C5 D5 E5 F5 8D 9D AD BD CD DD ED FD
+  // d7.val[0]: 83 93 A3 B3 C3 D3 E3 F3 8B 9B AB BB CB DB EB FB
+  // d7.val[1]: 87 97 A7 B7 C7 D7 E7 F7 8F 9F AF BF CF DF EF FF
+  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+                                    vreinterpretq_u32_u16(c2.val[0]));
+  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+                                    vreinterpretq_u32_u16(c2.val[1]));
+  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
+                                    vreinterpretq_u32_u16(c3.val[0]));
+  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
+                                    vreinterpretq_u32_u16(c3.val[1]));
+  const uint32x4x2_t d4 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[0]),
+                                    vreinterpretq_u32_u16(c6.val[0]));
+  const uint32x4x2_t d5 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[1]),
+                                    vreinterpretq_u32_u16(c6.val[1]));
+  const uint32x4x2_t d6 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[0]),
+                                    vreinterpretq_u32_u16(c7.val[0]));
+  const uint32x4x2_t d7 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[1]),
+                                    vreinterpretq_u32_u16(c7.val[1]));
+
+  // Swap 64 bit elements resulting in:
+  // e0.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
+  // e0.val[1]: 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8
+  // e1.val[0]: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
+  // e1.val[1]: 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9
+  // e2.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
+  // e2.val[1]: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA
+  // e3.val[0]: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
+  // e3.val[1]: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB
+  // e4.val[0]: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
+  // e4.val[1]: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC
+  // e5.val[0]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
+  // e5.val[1]: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD
+  // e6.val[0]: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
+  // e6.val[1]: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE
+  // e7.val[0]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
+  // e7.val[1]: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF
+  const uint8x16x2_t e0 = vpx_vtrnq_u64_to_u8(d0.val[0], d4.val[0]);
+  const uint8x16x2_t e1 = vpx_vtrnq_u64_to_u8(d2.val[0], d6.val[0]);
+  const uint8x16x2_t e2 = vpx_vtrnq_u64_to_u8(d1.val[0], d5.val[0]);
+  const uint8x16x2_t e3 = vpx_vtrnq_u64_to_u8(d3.val[0], d7.val[0]);
+  const uint8x16x2_t e4 = vpx_vtrnq_u64_to_u8(d0.val[1], d4.val[1]);
+  const uint8x16x2_t e5 = vpx_vtrnq_u64_to_u8(d2.val[1], d6.val[1]);
+  const uint8x16x2_t e6 = vpx_vtrnq_u64_to_u8(d1.val[1], d5.val[1]);
+  const uint8x16x2_t e7 = vpx_vtrnq_u64_to_u8(d3.val[1], d7.val[1]);
+
+  // Output:
+  // o0 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
+  // o1 : 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
+  // o2 : 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
+  // o3 : 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
+  // o4 : 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
+  // o5 : 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
+  // o6 : 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
+  // o7 : 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
+  // o8 : 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8
+  // o9 : 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9
+  // o10: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA
+  // o11: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB
+  // o12: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC
+  // o13: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD
+  // o14: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE
+  // o15: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF
+  *o0 = e0.val[0];
+  *o1 = e1.val[0];
+  *o2 = e2.val[0];
+  *o3 = e3.val[0];
+  *o4 = e4.val[0];
+  *o5 = e5.val[0];
+  *o6 = e6.val[0];
+  *o7 = e7.val[0];
+  *o8 = e0.val[1];
+  *o9 = e1.val[1];
+  *o10 = e2.val[1];
+  *o11 = e3.val[1];
+  *o12 = e4.val[1];
+  *o13 = e5.val[1];
+  *o14 = e6.val[1];
+  *o15 = e7.val[1];
+}
+
+// Transpose a 16x16 block of 16-bit values stored as two arrays of eight
+// rows each (in0 = rows 0-7, in1 = rows 8-15; each row spans two registers).
+static INLINE void transpose_s16_16x16(int16x8_t *in0, int16x8_t *in1) {
+  int i;
+
+  // Transpose the four 8x8 quadrants separately, but first exchange the two
+  // off-diagonal quadrants so each one can be transposed in place.
+  for (i = 0; i < 8; ++i) {
+    const int16x8_t tmp = in0[i + 8];
+    in0[i + 8] = in1[i];
+    in1[i] = tmp;
+  }
+
+  transpose_s16_8x8(&in0[0], &in0[1], &in0[2], &in0[3], &in0[4], &in0[5],
+                    &in0[6], &in0[7]);
+  transpose_s16_8x8(&in0[8], &in0[9], &in0[10], &in0[11], &in0[12], &in0[13],
+                    &in0[14], &in0[15]);
+  transpose_s16_8x8(&in1[0], &in1[1], &in1[2], &in1[3], &in1[4], &in1[5],
+                    &in1[6], &in1[7]);
+  transpose_s16_8x8(&in1[8], &in1[9], &in1[10], &in1[11], &in1[12], &in1[13],
+                    &in1[14], &in1[15]);
+}
+
+// Load eight 8-byte rows from 'a' and return the first four columns of the
+// transpose in a0..a3; the remaining columns are discarded.
+static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
+                                             const int a_stride, uint8x8_t *a0,
+                                             uint8x8_t *a1, uint8x8_t *a2,
+                                             uint8x8_t *a3) {
+  uint8x8_t a4, a5, a6, a7;
+
+  *a0 = vld1_u8(a + 0 * a_stride);
+  *a1 = vld1_u8(a + 1 * a_stride);
+  *a2 = vld1_u8(a + 2 * a_stride);
+  *a3 = vld1_u8(a + 3 * a_stride);
+  a4 = vld1_u8(a + 4 * a_stride);
+  a5 = vld1_u8(a + 5 * a_stride);
+  a6 = vld1_u8(a + 6 * a_stride);
+  a7 = vld1_u8(a + 7 * a_stride);
+
+  transpose_u8_4x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
+
+// Load an 8x8 block of bytes from 'a' and return its transpose in a0..a7.
+static INLINE void load_and_transpose_u8_8x8(const uint8_t *a,
+                                             const int a_stride, uint8x8_t *a0,
+                                             uint8x8_t *a1, uint8x8_t *a2,
+                                             uint8x8_t *a3, uint8x8_t *a4,
+                                             uint8x8_t *a5, uint8x8_t *a6,
+                                             uint8x8_t *a7) {
+  *a0 = vld1_u8(a + 0 * a_stride);
+  *a1 = vld1_u8(a + 1 * a_stride);
+  *a2 = vld1_u8(a + 2 * a_stride);
+  *a3 = vld1_u8(a + 3 * a_stride);
+  *a4 = vld1_u8(a + 4 * a_stride);
+  *a5 = vld1_u8(a + 5 * a_stride);
+  *a6 = vld1_u8(a + 6 * a_stride);
+  *a7 = vld1_u8(a + 7 * a_stride);
+
+  transpose_u8_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
+
+// Transpose the 8x8 block held in a0..a7 and store it row by row to 'a'.
+static INLINE void transpose_and_store_u8_8x8(uint8_t *a, const int a_stride,
+                                              uint8x8_t a0, uint8x8_t a1,
+                                              uint8x8_t a2, uint8x8_t a3,
+                                              uint8x8_t a4, uint8x8_t a5,
+                                              uint8x8_t a6, uint8x8_t a7) {
+  transpose_u8_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  vst1_u8(a + 0 * a_stride, a0);
+  vst1_u8(a + 1 * a_stride, a1);
+  vst1_u8(a + 2 * a_stride, a2);
+  vst1_u8(a + 3 * a_stride, a3);
+  vst1_u8(a + 4 * a_stride, a4);
+  vst1_u8(a + 5 * a_stride, a5);
+  vst1_u8(a + 6 * a_stride, a6);
+  vst1_u8(a + 7 * a_stride, a7);
+}
+
+// Load an 8x8 block of 16-bit values from 'a' and return its transpose in
+// a0..a7.
+static INLINE void load_and_transpose_s16_8x8(const int16_t *a,
+                                              const int a_stride, int16x8_t *a0,
+                                              int16x8_t *a1, int16x8_t *a2,
+                                              int16x8_t *a3, int16x8_t *a4,
+                                              int16x8_t *a5, int16x8_t *a6,
+                                              int16x8_t *a7) {
+  *a0 = vld1q_s16(a + 0 * a_stride);
+  *a1 = vld1q_s16(a + 1 * a_stride);
+  *a2 = vld1q_s16(a + 2 * a_stride);
+  *a3 = vld1q_s16(a + 3 * a_stride);
+  *a4 = vld1q_s16(a + 4 * a_stride);
+  *a5 = vld1q_s16(a + 5 * a_stride);
+  *a6 = vld1q_s16(a + 6 * a_stride);
+  *a7 = vld1q_s16(a + 7 * a_stride);
+
+  transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
+
+// Load an 8x8 block of 32-bit values from 'a' and return its transpose.
+// Each row is eight values held across the two halves of an int32x4x2_t.
+static INLINE void load_and_transpose_s32_8x8(
+    const int32_t *a, const int a_stride, int32x4x2_t *const a0,
+    int32x4x2_t *const a1, int32x4x2_t *const a2, int32x4x2_t *const a3,
+    int32x4x2_t *const a4, int32x4x2_t *const a5, int32x4x2_t *const a6,
+    int32x4x2_t *const a7) {
+  int32x4x2_t *const rows[8] = { a0, a1, a2, a3, a4, a5, a6, a7 };
+  int i;
+
+  for (i = 0; i < 8; ++i) {
+    rows[i]->val[0] = vld1q_s32(a);
+    rows[i]->val[1] = vld1q_s32(a + 4);
+    a += a_stride;
+  }
+
+  transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
+#endif // VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c
new file mode 100644
index 0000000000..69ff1cf153
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c
@@ -0,0 +1,552 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+#if defined(__ARM_FEATURE_DOTPROD)
+
+// Process a block of width 4 four rows at a time.
+// On return *sum is the sum of (src - ref) pixel differences and *sse is the
+// sum of squared differences.
+static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *ref_ptr, int ref_stride,
+                                     int h, uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+  // Dotting with an all-ones vector reduces byte lanes into per-lane 32-bit
+  // sums; the constant is loop invariant, so hoist it out of the loop.
+  const uint8x16_t ones = vdupq_n_u8(1);
+
+  int i = h;
+  do {
+    const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
+    const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
+
+    const uint8x16_t abs_diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src_sum = vdotq_u32(src_sum, s, ones);
+    ref_sum = vdotq_u32(ref_sum, r, ones);
+
+    src_ptr += 4 * src_stride;
+    ref_ptr += 4 * ref_stride;
+    i -= 4;
+  } while (i != 0);
+
+  *sum = horizontal_add_int32x4(
+      vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+  *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of width 8 two rows at a time.
+// On return *sum is the sum of (src - ref) pixel differences and *sse is the
+// sum of squared differences.
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *ref_ptr, int ref_stride,
+                                     int h, uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+  // Loop-invariant all-ones multiplicand for the reducing dot products.
+  const uint8x16_t ones = vdupq_n_u8(1);
+
+  int i = h;
+  do {
+    const uint8x16_t s =
+        vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride));
+    const uint8x16_t r =
+        vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride));
+
+    const uint8x16_t abs_diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src_sum = vdotq_u32(src_sum, s, ones);
+    ref_sum = vdotq_u32(ref_sum, r, ones);
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  *sum = horizontal_add_int32x4(
+      vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+  *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of width 16 one row at a time.
+// On return *sum is the sum of (src - ref) pixel differences and *sse is the
+// sum of squared differences.
+static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride,
+                                      const uint8_t *ref_ptr, int ref_stride,
+                                      int h, uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+  // Loop-invariant all-ones multiplicand for the reducing dot products.
+  const uint8x16_t ones = vdupq_n_u8(1);
+
+  int i = h;
+  do {
+    const uint8x16_t s = vld1q_u8(src_ptr);
+    const uint8x16_t r = vld1q_u8(ref_ptr);
+
+    const uint8x16_t abs_diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src_sum = vdotq_u32(src_sum, s, ones);
+    ref_sum = vdotq_u32(ref_sum, r, ones);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sum = horizontal_add_int32x4(
+      vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+  *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of any size where the width is divisible by 16.
+// On return *sum is the sum of (src - ref) pixel differences and *sse is the
+// sum of squared differences.
+static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       int w, int h, uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+  // Loop-invariant all-ones multiplicand for the reducing dot products;
+  // hoisted out of the doubly-nested loop.
+  const uint8x16_t ones = vdupq_n_u8(1);
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      const uint8x16_t s = vld1q_u8(src_ptr + j);
+      const uint8x16_t r = vld1q_u8(ref_ptr + j);
+
+      const uint8x16_t abs_diff = vabdq_u8(s, r);
+      sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+      src_sum = vdotq_u32(src_sum, s, ones);
+      ref_sum = vdotq_u32(ref_sum, r, ones);
+
+      j += 16;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sum = horizontal_add_int32x4(
+      vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+  *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride, int h,
+                                      uint32_t *sse, int *sum) {
+  // Fixed-width wrapper around the generic routine (w == 32).
+  variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
+
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride, int h,
+                                      uint32_t *sse, int *sum) {
+  // Fixed-width wrapper around the generic routine (w == 64).
+  variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
+
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
+// Process a block of width 4 two rows at a time.
+// On return *sum is the sum of (src - ref) pixel differences and *sse is the
+// sum of squared differences.
+static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *ref_ptr, int ref_stride,
+                                     int h, uint32_t *sse, int *sum) {
+  int16x8_t diff_acc = vdupq_n_s16(0);
+  int32x4_t sq_acc = vdupq_n_s32(0);
+  int i = h;
+
+  // Each 16-bit lane of 'diff_acc' gains one pixel difference (|d| <= 255)
+  // per iteration; 32767 / 255 ~= 128 iterations = 256 rows of width 4.
+  assert(h <= 256);
+
+  do {
+    const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+    const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+    const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
+
+    diff_acc = vaddq_s16(diff_acc, diff);
+    sq_acc = vmlal_s16(sq_acc, vget_low_s16(diff), vget_low_s16(diff));
+    sq_acc = vmlal_s16(sq_acc, vget_high_s16(diff), vget_high_s16(diff));
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  *sum = horizontal_add_int16x8(diff_acc);
+  *sse = (uint32_t)horizontal_add_int32x4(sq_acc);
+}
+
+// Process a block of width 8 one row at a time.
+// On return *sum is the sum of (src - ref) pixel differences and *sse is the
+// sum of squared differences.
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *ref_ptr, int ref_stride,
+                                     int h, uint32_t *sse, int *sum) {
+  int16x8_t diff_acc = vdupq_n_s16(0);
+  // Independent square-sum accumulators for the low and high diff halves.
+  int32x4_t sq_acc_lo = vdupq_n_s32(0);
+  int32x4_t sq_acc_hi = vdupq_n_s32(0);
+  int i = h;
+
+  // Each 16-bit lane of 'diff_acc' gains at most |255| per row, so it is
+  // safe for up to 32767 / 255 ~= 128 rows.
+  assert(h <= 128);
+
+  do {
+    const uint8x8_t s = vld1_u8(src_ptr);
+    const uint8x8_t r = vld1_u8(ref_ptr);
+    const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
+
+    diff_acc = vaddq_s16(diff_acc, diff);
+    sq_acc_lo = vmlal_s16(sq_acc_lo, vget_low_s16(diff), vget_low_s16(diff));
+    sq_acc_hi = vmlal_s16(sq_acc_hi, vget_high_s16(diff), vget_high_s16(diff));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sum = horizontal_add_int16x8(diff_acc);
+  *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sq_acc_lo, sq_acc_hi));
+}
+
+// Process a block of width 16 one row at a time.
+//
+// Accumulates the sum of pixel differences (src - ref) into *sum and the
+// sum of squared differences into *sse over a 16xh block. Low/high halves
+// of each row use separate accumulators to shorten dependency chains.
+static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int i = h;
+
+ // Number of rows we can process before 'sum_s16' accumulators overflow:
+ // 32767 / 255 ~= 128, so 128 16-wide rows.
+ assert(h <= 128);
+
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr);
+ const uint8x16_t r = vld1q_u8(ref_ptr);
+
+ // Widen each half to u16 and reinterpret as s16 so underflow wraps to
+ // the correct negative difference.
+ const int16x8_t diff_l =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+ const int16x8_t diff_h =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+ sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+ sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_int16x8(vaddq_s16(sum_s16[0], sum_s16[1]));
+ *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+// Process a block of any size where the width is divisible by 16.
+//
+// Accumulates the sum of pixel differences (src - ref) into *sum and the
+// sum of squared differences into *sse over a wxh block. The row sums are
+// kept in 16-bit lanes and periodically (every 'h_limit' rows) folded into
+// a 32-bit accumulator so that arbitrary heights can be handled.
+//
+// NOTE(review): after the first pass 'h_tmp += h_limit' is used as the next
+// inner-loop bound, so when h > h_limit this assumes h is a multiple of
+// h_limit or the inner loop would run past row h. This holds for all
+// callers below (h in {16, 32, 64}) — confirm before adding new ones.
+static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int w, int h, int h_limit,
+ unsigned int *sse, int *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit
+ // accumulator overflows. After hitting this limit we accumulate into 32-bit
+ // elements.
+ int h_tmp = h > h_limit ? h_limit : h;
+
+ int i = 0;
+ do {
+ // Fresh 16-bit row-sum accumulators for each 'h_tmp'-row chunk.
+ int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+ do {
+ int j = 0;
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr + j);
+ const uint8x16_t r = vld1q_u8(ref_ptr + j);
+
+ const int16x8_t diff_l =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+ const int16x8_t diff_h =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+ sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+ sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+ j += 16;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ i++;
+ } while (i < h_tmp);
+
+ // Fold the 16-bit row sums into the 32-bit total before they can
+ // overflow.
+ sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]);
+ sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]);
+
+ h_tmp += h_limit;
+ } while (i < h);
+
+ *sum = horizontal_add_int32x4(sum_s32);
+ *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+// 32-wide wrapper: 64 rows fit in the 16-bit accumulators per chunk.
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum);
+}
+
+// 64-wide wrapper: only 32 rows fit in the 16-bit accumulators per chunk.
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum);
+}
+
+#endif // defined(__ARM_FEATURE_DOTPROD)
+
+// Public entry point: sum and SSE for an 8x8 block (see vpx_dsp_rtcd).
+void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
+ variance_8xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum);
+}
+
+// Public entry point: sum and SSE for a 16x16 block (see vpx_dsp_rtcd).
+void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
+ variance_16xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, sum);
+}
+
+// Defines vpx_variance{w}x{h}_neon(): variance = sse - sum^2 / (w * h),
+// where 'shift' satisfies (1 << shift) == w * h. The 64-bit cast on the
+// sum product avoids signed overflow for the largest blocks.
+#define VARIANCE_WXH_NEON(w, h, shift) \
+ unsigned int vpx_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+ }
+
+VARIANCE_WXH_NEON(4, 4, 4)
+VARIANCE_WXH_NEON(4, 8, 5)
+
+VARIANCE_WXH_NEON(8, 4, 5)
+VARIANCE_WXH_NEON(8, 8, 6)
+VARIANCE_WXH_NEON(8, 16, 7)
+
+VARIANCE_WXH_NEON(16, 8, 7)
+VARIANCE_WXH_NEON(16, 16, 8)
+VARIANCE_WXH_NEON(16, 32, 9)
+
+VARIANCE_WXH_NEON(32, 16, 9)
+VARIANCE_WXH_NEON(32, 32, 10)
+VARIANCE_WXH_NEON(32, 64, 11)
+
+VARIANCE_WXH_NEON(64, 32, 11)
+VARIANCE_WXH_NEON(64, 64, 12)
+
+#if defined(__ARM_FEATURE_DOTPROD)
+
+// Sum of squared errors for an 8-wide block using the UDOT (dot product)
+// extension: the absolute difference is dotted with itself, accumulating
+// four squares per lane. Writes the SSE to *sse and also returns it.
+// Processes two rows per iteration, so h must be even.
+static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x8_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ s1 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ r0 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ r1 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+
+ diff0 = vabd_u8(s0, r0);
+ diff1 = vabd_u8(s1, r1);
+
+ // |d| dot |d| == sum of squared differences for 4 pixels per lane.
+ sse_u32[0] = vdot_u32(sse_u32[0], diff0, diff0);
+ sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1);
+ } while (--i != 0);
+
+ *sse = horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1]));
+ return *sse;
+}
+
+// Sum of squared errors for a 16-wide block using the UDOT (dot product)
+// extension. Writes the SSE to *sse and also returns it. Processes two
+// rows per iteration, so h must be even.
+static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ s1 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ r0 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ r1 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+
+ diff0 = vabdq_u8(s0, r0);
+ diff1 = vabdq_u8(s1, r1);
+
+ // |d| dot |d| == sum of squared differences for 4 pixels per lane.
+ sse_u32[0] = vdotq_u32(sse_u32[0], diff0, diff0);
+ sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1);
+ } while (--i != 0);
+
+ *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+ return *sse;
+}
+
+// Sum of squared errors for a 4x4 block, dot-product variant: the whole
+// block is gathered into one 16-lane vector and squared with a single UDOT.
+unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
+ uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+
+ uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff);
+
+ return horizontal_add_uint32x4(sse);
+}
+
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
+// Sum of squared errors for an 8-wide block (no dot-product extension):
+// squares are formed with a widening multiply of the absolute differences
+// and pairwise-accumulated into 32-bit lanes. Writes the SSE to *sse and
+// also returns it. Processes two rows per iteration, so h must be even.
+static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x8_t s0, s1, r0, r1, diff0, diff1;
+ uint16x8_t sse0, sse1;
+
+ s0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ s1 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ r0 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ r1 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+
+ diff0 = vabd_u8(s0, r0);
+ diff1 = vabd_u8(s1, r1);
+
+ // d * d fits in u16 (255^2 = 65025); vpadalq widens to u32.
+ sse0 = vmull_u8(diff0, diff0);
+ sse_u32[0] = vpadalq_u16(sse_u32[0], sse0);
+ sse1 = vmull_u8(diff1, diff1);
+ sse_u32[1] = vpadalq_u16(sse_u32[1], sse1);
+ } while (--i != 0);
+
+ *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+ return *sse;
+}
+
+// Sum of squared errors for a 16-wide block (no dot-product extension),
+// one row per iteration. Writes the SSE to *sse and also returns it.
+static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s, r, diff;
+ uint16x8_t sse0, sse1;
+
+ s = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ r = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+
+ diff = vabdq_u8(s, r);
+
+ // d * d fits in u16 (255^2 = 65025); vpadalq widens to u32.
+ sse0 = vmull_u8(vget_low_u8(diff), vget_low_u8(diff));
+ sse_u32[0] = vpadalq_u16(sse_u32[0], sse0);
+ sse1 = vmull_u8(vget_high_u8(diff), vget_high_u8(diff));
+ sse_u32[1] = vpadalq_u16(sse_u32[1], sse1);
+ } while (--i != 0);
+
+ *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+ return *sse;
+}
+
+// Sum of squared errors for a 4x4 block (no dot-product extension).
+// NOTE(review): load_u8() is assumed to gather two 4-wide rows into one
+// 8-lane vector (it is a helper defined elsewhere in this file's headers)
+// — the 2 * stride advance between the two loads is consistent with that.
+unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x8_t s[2], r[2];
+ uint16x8_t abs_diff[2];
+ uint32x4_t sse;
+
+ s[0] = load_u8(src_ptr, src_stride);
+ r[0] = load_u8(ref_ptr, ref_stride);
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ s[1] = load_u8(src_ptr, src_stride);
+ r[1] = load_u8(ref_ptr, ref_stride);
+
+ // Widening absolute difference, then square-and-accumulate into u32.
+ abs_diff[0] = vabdl_u8(s[0], r[0]);
+ abs_diff[1] = vabdl_u8(s[1], r[1]);
+
+ sse = vmull_u16(vget_low_u16(abs_diff[0]), vget_low_u16(abs_diff[0]));
+ sse = vmlal_u16(sse, vget_high_u16(abs_diff[0]), vget_high_u16(abs_diff[0]));
+ sse = vmlal_u16(sse, vget_low_u16(abs_diff[1]), vget_low_u16(abs_diff[1]));
+ sse = vmlal_u16(sse, vget_high_u16(abs_diff[1]), vget_high_u16(abs_diff[1]));
+
+ return horizontal_add_uint32x4(sse);
+}
+
+#endif // defined(__ARM_FEATURE_DOTPROD)
+
+// Defines vpx_mse{w}x{h}_neon() as a thin wrapper around the width-generic
+// helper above (whichever of the two #if branches was compiled in).
+#define VPX_MSE_WXH_NEON(w, h) \
+ unsigned int vpx_mse##w##x##h##_neon( \
+ const unsigned char *src_ptr, int src_stride, \
+ const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \
+ return vpx_mse##w##xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, h, \
+ sse); \
+ }
+
+VPX_MSE_WXH_NEON(8, 8)
+VPX_MSE_WXH_NEON(8, 16)
+VPX_MSE_WXH_NEON(16, 8)
+VPX_MSE_WXH_NEON(16, 16)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm
new file mode 100644
index 0000000000..d8e4bcc3a7
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm
@@ -0,0 +1,438 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers*****************************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r3 => dst_stride
+; r4 => filter_x0
+; r8 => ht
+; r10 => wd
+
+ EXPORT |vpx_convolve8_avg_horiz_filter_type1_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_horiz_filter_type1_neon| PROC
+
+ ; 8-tap horizontal convolve for the "type 1" filter sign pattern,
+ ; averaging the filtered result into the destination (vrhadd with the
+ ; dst loads). Width selects a loop family: outer_loop_4 (wd <= 4),
+ ; outer_loop_8 (wd 8/12) or outer_loop_16 (wd >= 16); wd == 24 runs a
+ ; 16-wide pass followed by an 8-wide residual pass.
+ stmfd sp!, {r4 - r12, r14} ;stack stores the values of
+ ; the arguments
+ vpush {d8 - d15} ; stack offset by 64
+ mov r4, r1
+ mov r1, r2
+ mov r2, r4
+
+start_loop_count
+ ldr r4, [sp, #104] ;loads pi1_coeff
+ ldr r8, [sp, #108] ;loads x0_q4
+ add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4]
+ ldr r8, [sp, #128] ;loads ht
+ ldr r10, [sp, #124] ;loads wd
+ vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff)
+ mov r11, #1
+ subs r14, r8, #0 ;checks for ht == 0
+ vabs.s8 d2, d0 ;vabs_s8(coeff)
+ vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0)
+ sub r12, r0, #3 ;pu1_src - 3
+ vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1)
+ add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd
+ vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2)
+ rsb r9, r10, r2, lsl #1 ;2*src_strd - wd
+ vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3)
+ rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd
+ vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4)
+ vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5)
+ vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6)
+ vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7)
+ mov r7, r1
+ cmp r10, #4
+ ble outer_loop_4
+
+ cmp r10, #24
+ moveq r10, #16
+ addeq r8, #8
+ addeq r9, #8
+ cmp r10, #16
+ bge outer_loop_16
+
+ cmp r10, #12
+ addeq r8, #4
+ addeq r9, #4
+ b outer_loop_8
+
+outer_loop8_residual
+ sub r12, r0, #3 ;pu1_src - 3
+ mov r1, r7
+ mov r14, #32
+ add r1, #16
+ add r12, #16
+ mov r10, #8
+ add r8, #8
+ add r9, #8
+
+outer_loop_8
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ subs r5, r10, #0 ;checks wd
+ ble end_inner_loop_8
+
+inner_loop_8
+ mov r7, #0xc000
+ vld1.u32 {d0}, [r12], r11 ;vector load pu1_src
+ vdup.16 q4, r7
+ vld1.u32 {d1}, [r12], r11
+ vdup.16 q5, r7
+ vld1.u32 {d2}, [r12], r11
+ vld1.u32 {d3}, [r12], r11
+ mov r7, #0x4000
+ vld1.u32 {d4}, [r12], r11
+ vmlsl.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {d5}, [r12], r11
+ vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {d6}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {d7}, [r12], r11
+ vmlal.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd
+ vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vld1.u32 {d13}, [r4], r11
+ vmlal.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vld1.u32 {d14}, [r4], r11
+ vmlsl.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vld1.u32 {d15}, [r4], r11
+ vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd
+ vdup.16 q11, r7
+ vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {d17}, [r4], r11
+ vmlal.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vhadd.s16 q4, q4, q11
+ vld1.u32 {d18}, [r4], r11
+ vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd
+ vmlal.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vld1.u8 {d6}, [r1]
+ vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow
+ ; result 1
+ vmlsl.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ vld1.u8 {d7}, [r6]
+ vrhadd.u8 d20, d20, d6
+ vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vmlsl.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vst1.8 {d20}, [r1]! ;store the result pu1_dst
+ vhadd.s16 q5, q5, q11
+ subs r5, r5, #8 ;decrement the wd loop
+ vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow
+ ; result 2
+ vrhadd.u8 d8, d8, d7
+ vst1.8 {d8}, [r6]! ;store the result pu1_dst
+ cmp r5, #4
+ bgt inner_loop_8
+
+end_inner_loop_8
+ subs r14, r14, #2 ;decrement the ht loop
+ add r12, r12, r9 ;increment the src pointer by
+ ; 2*src_strd-wd
+ add r1, r1, r8 ;increment the dst pointer by
+ ; 2*dst_strd-wd
+ bgt outer_loop_8
+
+ ; NOTE(review): comment says wd but the offset (#120) differs from the
+ ; #124 used at start_loop_count; verify the stack layout on this path.
+ ldr r10, [sp, #120] ;loads wd
+ cmp r10, #12
+ beq outer_loop4_residual
+
+end_loops
+ b end_func
+
+outer_loop_16
+ str r0, [sp, #-4]!
+ str r7, [sp, #-4]!
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ and r0, r12, #31
+ mov r7, #0xc000
+ sub r5, r10, #0 ;checks wd
+ pld [r4, r2, lsl #1]
+ pld [r12, r2, lsl #1]
+ vld1.u32 {q0}, [r12], r11 ;vector load pu1_src
+ vdup.16 q4, r7
+ vld1.u32 {q1}, [r12], r11
+ vld1.u32 {q2}, [r12], r11
+ vld1.u32 {q3}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {q6}, [r12], r11
+ vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q7}, [r12], r11
+ vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {q8}, [r12], r11
+ vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {q9}, [r12], r11
+ vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vdup.16 q10, r7
+ vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+
+inner_loop_16
+ vmlsl.u8 q10, d1, d24
+ vdup.16 q5, r7
+ vmlsl.u8 q10, d3, d25
+ mov r7, #0x4000
+ vdup.16 q11, r7
+ vmlal.u8 q10, d5, d26
+ vld1.u32 {q0}, [r4], r11 ;vector load pu1_src
+ vhadd.s16 q4, q4, q11
+ vld1.u32 {q1}, [r4], r11
+ vmlal.u8 q10, d7, d27
+ add r12, #8
+ subs r5, r5, #16
+ vmlal.u8 q10, d13, d28
+ vld1.u32 {q2}, [r4], r11
+ vmlal.u8 q10, d15, d29
+ vld1.u32 {q3}, [r4], r11
+ vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow
+ ; result 1
+ vmlsl.u8 q10, d17, d30
+ vld1.u32 {q6}, [r4], r11
+ vmlsl.u8 q10, d19, d31
+ vld1.u32 {q7}, [r4], r11
+ add r7, r1, #8
+ vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vmlsl.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q8}, [r4], r11
+ vhadd.s16 q10, q10, q11
+ vld1.u32 {q9}, [r4], r11
+ vld1.u8 {d0}, [r1]
+ vmlal.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u8 {d2}, [r7]
+ vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ add r4, #8
+ mov r7, #0xc000
+ vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlal.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vqrshrun.s16 d9, q10, #6
+ vdup.16 q11, r7
+ vmlsl.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ mov r7, #0x4000
+ vrhadd.u8 d8, d8, d0
+ vrhadd.u8 d9, d9, d2
+ vmlsl.u8 q11, d1, d24
+ vmlsl.u8 q11, d3, d25
+ vdup.16 q10, r7
+ vmlal.u8 q11, d5, d26
+ pld [r12, r2, lsl #2]
+ pld [r4, r2, lsl #2]
+ addeq r12, r12, r9 ;increment the src pointer by
+ ; 2*src_strd-wd
+ addeq r4, r12, r2 ;pu1_src + src_strd
+ vmlal.u8 q11, d7, d27
+ vmlal.u8 q11, d13, d28
+ vst1.8 {q4}, [r1]! ;store the result pu1_dst
+ subeq r14, r14, #2
+ vhadd.s16 q5, q5, q10
+ vmlal.u8 q11, d15, d29
+ addeq r1, r1, r8
+ vmlsl.u8 q11, d17, d30
+ cmp r14, #0
+ vmlsl.u8 q11, d19, d31
+ vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow
+ ; result 2
+ beq epilog_16
+
+ vld1.u32 {q0}, [r12], r11 ;vector load pu1_src
+ mov r7, #0xc000
+ cmp r5, #0
+ vld1.u32 {q1}, [r12], r11
+ vhadd.s16 q11, q11, q10
+ vld1.u32 {q2}, [r12], r11
+ vdup.16 q4, r7
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vdup.16 q10, r7
+ vld1.u32 {q3}, [r12], r11
+ add r7, r6, #8
+ moveq r5, r10
+ vld1.u8 {d0}, [r6]
+ vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u8 {d2}, [r7]
+ vqrshrun.s16 d11, q11, #6
+ vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {q6}, [r12], r11
+ vrhadd.u8 d10, d10, d0
+ vld1.u32 {q7}, [r12], r11
+ vrhadd.u8 d11, d11, d2
+ vld1.u32 {q8}, [r12], r11
+ vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {q9}, [r12], r11
+ vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ mov r7, #0xc000
+ vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vst1.8 {q5}, [r6]! ;store the result pu1_dst
+ vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ addeq r6, r1, r3 ;pu1_dst + dst_strd
+ b inner_loop_16
+
+epilog_16
+ mov r7, #0x4000
+ ldr r0, [sp], #4
+ ldr r10, [sp, #120]
+ vdup.16 q10, r7
+ vhadd.s16 q11, q11, q10
+ vqrshrun.s16 d11, q11, #6
+ add r7, r6, #8
+ vld1.u8 {d20}, [r6]
+ vld1.u8 {d21}, [r7]
+ vrhadd.u8 d10, d10, d20
+ vrhadd.u8 d11, d11, d21
+ vst1.8 {q5}, [r6]! ;store the result pu1_dst
+ ldr r7, [sp], #4
+ cmp r10, #24
+ beq outer_loop8_residual
+
+end_loops1
+ b end_func
+
+outer_loop4_residual
+ sub r12, r0, #3 ;pu1_src - 3
+ mov r1, r7
+ add r1, #8
+ mov r10, #4
+ add r12, #8
+ mov r14, #16
+ add r8, #4
+ add r9, #4
+
+outer_loop_4
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ subs r5, r10, #0 ;checks wd
+ ble end_inner_loop_4
+
+inner_loop_4
+ vld1.u32 {d0}, [r12], r11 ;vector load pu1_src
+ vld1.u32 {d1}, [r12], r11
+ vld1.u32 {d2}, [r12], r11
+ vld1.u32 {d3}, [r12], r11
+ vld1.u32 {d4}, [r12], r11
+ vld1.u32 {d5}, [r12], r11
+ vld1.u32 {d6}, [r12], r11
+ vld1.u32 {d7}, [r12], r11
+ sub r12, r12, #4
+ vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd
+ vld1.u32 {d13}, [r4], r11
+ vzip.32 d0, d12 ;vector zip the i iteration and ii
+ ; interation in single register
+ vld1.u32 {d14}, [r4], r11
+ vzip.32 d1, d13
+ vld1.u32 {d15}, [r4], r11
+ vzip.32 d2, d14
+ vld1.u32 {d16}, [r4], r11
+ vzip.32 d3, d15
+ vld1.u32 {d17}, [r4], r11
+ vzip.32 d4, d16
+ vld1.u32 {d18}, [r4], r11
+ vzip.32 d5, d17
+ vld1.u32 {d19}, [r4], r11
+ mov r7, #0xc000
+ vdup.16 q4, r7
+ sub r4, r4, #4
+ vzip.32 d6, d18
+ vzip.32 d7, d19
+ vmlsl.u8 q4, d1, d25 ;arithmetic operations for ii
+ ; iteration in the same time
+ vmlsl.u8 q4, d0, d24
+ vmlal.u8 q4, d2, d26
+ vmlal.u8 q4, d3, d27
+ vmlal.u8 q4, d4, d28
+ vmlal.u8 q4, d5, d29
+ vmlsl.u8 q4, d6, d30
+ vmlsl.u8 q4, d7, d31
+ mov r7, #0x4000
+ vdup.16 q10, r7
+ vhadd.s16 q4, q4, q10
+ vqrshrun.s16 d8, q4, #6
+ vld1.u32 {d10[0]}, [r1]
+ vld1.u32 {d10[1]}, [r6]
+ vrhadd.u8 d8, d8, d10
+ vst1.32 {d8[0]},[r1]! ;store the i iteration result which
+ ; is in upper part of the register
+ vst1.32 {d8[1]},[r6]! ;store the ii iteration result which
+ ; is in lower part of the register
+ subs r5, r5, #4 ;decrement the wd by 4
+ bgt inner_loop_4
+
+end_inner_loop_4
+ subs r14, r14, #2 ;decrement the ht by 4
+ add r12, r12, r9 ;increment the input pointer
+ ; 2*src_strd-wd
+ add r1, r1, r8 ;increment the output pointer
+ ; 2*dst_strd-wd
+ bgt outer_loop_4
+
+end_func
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm
new file mode 100644
index 0000000000..7a77747fec
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm
@@ -0,0 +1,439 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r3 => dst_stride
+; r4 => filter_x0
+; r8 => ht
+; r10 => wd
+
+ EXPORT |vpx_convolve8_avg_horiz_filter_type2_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_horiz_filter_type2_neon| PROC
+
+ ; 8-tap horizontal convolve for the "type 2" filter sign pattern
+ ; (vmlal/vmlsl tap signs differ from the type 1 routine), averaging the
+ ; filtered result into the destination (vrhadd with the dst loads).
+ ; Width selects a loop family: outer_loop_4 (wd <= 4), outer_loop_8
+ ; (wd 8/12) or outer_loop_16 (wd >= 16); wd == 24 runs a 16-wide pass
+ ; followed by an 8-wide residual pass.
+ stmfd sp!, {r4 - r12, r14} ;stack stores the values of
+ ; the arguments
+ vpush {d8 - d15} ; stack offset by 64
+ mov r4, r1
+ mov r1, r2
+ mov r2, r4
+
+start_loop_count
+ ldr r4, [sp, #104] ;loads pi1_coeff
+ ldr r8, [sp, #108] ;loads x0_q4
+ add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4]
+ ldr r8, [sp, #128] ;loads ht
+ ldr r10, [sp, #124] ;loads wd
+ vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff)
+ mov r11, #1
+ subs r14, r8, #0 ;checks for ht == 0
+ vabs.s8 d2, d0 ;vabs_s8(coeff)
+ vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0)
+ sub r12, r0, #3 ;pu1_src - 3
+ vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1)
+ add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd
+ vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2)
+ rsb r9, r10, r2, lsl #1 ;2*src_strd - wd
+ vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3)
+ rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd
+ vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4)
+ vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5)
+ vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6)
+ vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7)
+ mov r7, r1
+ cmp r10, #4
+ ble outer_loop_4
+
+ cmp r10, #24
+ moveq r10, #16
+ addeq r8, #8
+ addeq r9, #8
+ cmp r10, #16
+ bge outer_loop_16
+
+ cmp r10, #12
+ addeq r8, #4
+ addeq r9, #4
+ b outer_loop_8
+
+outer_loop8_residual
+ sub r12, r0, #3 ;pu1_src - 3
+ mov r1, r7
+ mov r14, #32
+ add r1, #16
+ add r12, #16
+ mov r10, #8
+ add r8, #8
+ add r9, #8
+
+outer_loop_8
+
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ subs r5, r10, #0 ;checks wd
+ ble end_inner_loop_8
+
+inner_loop_8
+ mov r7, #0xc000
+ vld1.u32 {d0}, [r12], r11 ;vector load pu1_src
+ vdup.16 q4, r7
+ vld1.u32 {d1}, [r12], r11
+ vdup.16 q5, r7
+ vld1.u32 {d2}, [r12], r11
+ vld1.u32 {d3}, [r12], r11
+ mov r7, #0x4000
+ vld1.u32 {d4}, [r12], r11
+ vmlal.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {d5}, [r12], r11
+ vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {d6}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {d7}, [r12], r11
+ vmlsl.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd
+ vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vld1.u32 {d13}, [r4], r11
+ vmlsl.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vld1.u32 {d14}, [r4], r11
+ vmlal.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vld1.u32 {d15}, [r4], r11
+ vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd
+ vdup.16 q11, r7
+ vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {d17}, [r4], r11
+ vmlsl.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vhadd.s16 q4, q4, q11
+ vld1.u32 {d18}, [r4], r11
+ vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd
+ vmlsl.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vld1.u8 {d6}, [r1]
+ vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow
+ ; result 1
+ vmlal.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ vld1.u8 {d7}, [r6]
+ vrhadd.u8 d20, d20, d6
+ vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vmlal.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vst1.8 {d20}, [r1]! ;store the result pu1_dst
+ vhadd.s16 q5, q5, q11
+ subs r5, r5, #8 ;decrement the wd loop
+ vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow
+ ; result 2
+ vrhadd.u8 d8, d8, d7
+ vst1.8 {d8}, [r6]! ;store the result pu1_dst
+ cmp r5, #4
+ bgt inner_loop_8
+
+end_inner_loop_8
+ subs r14, r14, #2 ;decrement the ht loop
+ add r12, r12, r9 ;increment the src pointer by
+ ; 2*src_strd-wd
+ add r1, r1, r8 ;increment the dst pointer by
+ ; 2*dst_strd-wd
+ bgt outer_loop_8
+
+ ; NOTE(review): comment says wd but the offset (#120) differs from the
+ ; #124 used at start_loop_count; verify the stack layout on this path.
+ ldr r10, [sp, #120] ;loads wd
+ cmp r10, #12
+ beq outer_loop4_residual
+
+end_loops
+ b end_func
+
+outer_loop_16
+ str r0, [sp, #-4]!
+ str r7, [sp, #-4]!
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ and r0, r12, #31
+ mov r7, #0xc000
+ sub r5, r10, #0 ;checks wd
+ pld [r4, r2, lsl #1]
+ pld [r12, r2, lsl #1]
+ vld1.u32 {q0}, [r12], r11 ;vector load pu1_src
+ vdup.16 q4, r7
+ vld1.u32 {q1}, [r12], r11
+ vld1.u32 {q2}, [r12], r11
+ vld1.u32 {q3}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {q6}, [r12], r11
+ vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q7}, [r12], r11
+ vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {q8}, [r12], r11
+ vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {q9}, [r12], r11
+ vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vdup.16 q10, r7
+ vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+
+inner_loop_16
+ vmlsl.u8 q10, d1, d24
+ vdup.16 q5, r7
+ vmlal.u8 q10, d3, d25
+ mov r7, #0x4000
+ vdup.16 q11, r7
+ vmlsl.u8 q10, d5, d26
+ vld1.u32 {q0}, [r4], r11 ;vector load pu1_src
+ vhadd.s16 q4, q4, q11
+ vld1.u32 {q1}, [r4], r11
+ vmlal.u8 q10, d7, d27
+ add r12, #8
+ subs r5, r5, #16
+ vmlal.u8 q10, d13, d28
+ vld1.u32 {q2}, [r4], r11
+ vmlsl.u8 q10, d15, d29
+ vld1.u32 {q3}, [r4], r11
+ vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow
+ ; result 1
+ vmlal.u8 q10, d17, d30
+ vld1.u32 {q6}, [r4], r11
+ vmlsl.u8 q10, d19, d31
+ vld1.u32 {q7}, [r4], r11
+ add r7, r1, #8
+ vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vmlal.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q8}, [r4], r11
+ vhadd.s16 q10, q10, q11
+ vld1.u32 {q9}, [r4], r11
+ vld1.u8 {d0}, [r1]
+ vmlsl.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u8 {d2}, [r7]
+ vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ add r4, #8
+ mov r7, #0xc000
+ vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlsl.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vqrshrun.s16 d9, q10, #6
+ vdup.16 q11, r7
+ vmlal.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ mov r7, #0x4000
+ vrhadd.u8 d8, d8, d0
+ vrhadd.u8 d9, d9, d2
+ vmlsl.u8 q11, d1, d24
+ vmlal.u8 q11, d3, d25
+ vdup.16 q10, r7
+ vmlsl.u8 q11, d5, d26
+ pld [r12, r2, lsl #2]
+ pld [r4, r2, lsl #2]
+ addeq r12, r12, r9 ;increment the src pointer by
+ ; 2*src_strd-wd
+ addeq r4, r12, r2 ;pu1_src + src_strd
+ vmlal.u8 q11, d7, d27
+ vmlal.u8 q11, d13, d28
+ vst1.8 {q4}, [r1]! ;store the result pu1_dst
+ subeq r14, r14, #2
+ vhadd.s16 q5, q5, q10
+ vmlsl.u8 q11, d15, d29
+ addeq r1, r1, r8
+ vmlal.u8 q11, d17, d30
+ cmp r14, #0
+ vmlsl.u8 q11, d19, d31
+ vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow
+ ; result 2
+ beq epilog_16
+
+ vld1.u32 {q0}, [r12], r11 ;vector load pu1_src
+ mov r7, #0xc000
+ cmp r5, #0
+ vld1.u32 {q1}, [r12], r11
+ vhadd.s16 q11, q11, q10
+ vld1.u32 {q2}, [r12], r11
+ vdup.16 q4, r7
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vdup.16 q10, r7
+ vld1.u32 {q3}, [r12], r11
+ add r7, r6, #8
+ moveq r5, r10
+ vld1.u8 {d0}, [r6]
+ vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u8 {d2}, [r7]
+ vqrshrun.s16 d11, q11, #6
+ vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {q6}, [r12], r11
+ vrhadd.u8 d10, d10, d0
+ vld1.u32 {q7}, [r12], r11
+ vrhadd.u8 d11, d11, d2
+ vld1.u32 {q8}, [r12], r11
+ vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {q9}, [r12], r11
+ vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ mov r7, #0xc000
+ vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vst1.8 {q5}, [r6]! ;store the result pu1_dst
+ vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ addeq r6, r1, r3 ;pu1_dst + dst_strd
+ b inner_loop_16
+
+epilog_16
+ mov r7, #0x4000
+ ldr r0, [sp], #4
+ ldr r10, [sp, #120]
+ vdup.16 q10, r7
+ vhadd.s16 q11, q11, q10
+ vqrshrun.s16 d11, q11, #6
+ add r7, r6, #8
+ vld1.u8 {d20}, [r6]
+ vld1.u8 {d21}, [r7]
+ vrhadd.u8 d10, d10, d20
+ vrhadd.u8 d11, d11, d21
+ vst1.8 {q5}, [r6]! ;store the result pu1_dst
+ ldr r7, [sp], #4
+ cmp r10, #24
+ beq outer_loop8_residual
+
+end_loops1
+ b end_func
+
+outer_loop4_residual
+ sub r12, r0, #3 ;pu1_src - 3
+ mov r1, r7
+ add r1, #8
+ mov r10, #4
+ add r12, #8
+ mov r14, #16
+ add r8, #4
+ add r9, #4
+
+outer_loop_4
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ subs r5, r10, #0 ;checks wd
+ ble end_inner_loop_4
+
+inner_loop_4
+ vld1.u32 {d0}, [r12], r11 ;vector load pu1_src
+ vld1.u32 {d1}, [r12], r11
+ vld1.u32 {d2}, [r12], r11
+ vld1.u32 {d3}, [r12], r11
+ vld1.u32 {d4}, [r12], r11
+ vld1.u32 {d5}, [r12], r11
+ vld1.u32 {d6}, [r12], r11
+ vld1.u32 {d7}, [r12], r11
+ sub r12, r12, #4
+ vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd
+ vld1.u32 {d13}, [r4], r11
+ vzip.32 d0, d12 ;vector zip the i iteration and ii
+ ; interation in single register
+ vld1.u32 {d14}, [r4], r11
+ vzip.32 d1, d13
+ vld1.u32 {d15}, [r4], r11
+ vzip.32 d2, d14
+ vld1.u32 {d16}, [r4], r11
+ vzip.32 d3, d15
+ vld1.u32 {d17}, [r4], r11
+ vzip.32 d4, d16
+ vld1.u32 {d18}, [r4], r11
+ vzip.32 d5, d17
+ vld1.u32 {d19}, [r4], r11
+ mov r7, #0xc000
+ vdup.16 q4, r7
+ sub r4, r4, #4
+ vzip.32 d6, d18
+ vzip.32 d7, d19
+ vmlal.u8 q4, d1, d25 ;arithmetic operations for ii
+ ; iteration in the same time
+ vmlsl.u8 q4, d0, d24
+ vmlsl.u8 q4, d2, d26
+ vmlal.u8 q4, d3, d27
+ vmlal.u8 q4, d4, d28
+ vmlsl.u8 q4, d5, d29
+ vmlal.u8 q4, d6, d30
+ vmlsl.u8 q4, d7, d31
+ mov r7, #0x4000
+ vdup.16 q10, r7
+ vhadd.s16 q4, q4, q10
+ vqrshrun.s16 d8, q4, #6
+ vld1.u32 {d10[0]}, [r1]
+ vld1.u32 {d10[1]}, [r6]
+ vrhadd.u8 d8, d8, d10
+ vst1.32 {d8[0]},[r1]! ;store the i iteration result which
+ ; is in upper part of the register
+ vst1.32 {d8[1]},[r6]! ;store the ii iteration result which
+ ; is in lower part of the register
+ subs r5, r5, #4 ;decrement the wd by 4
+ bgt inner_loop_4
+
+end_inner_loop_4
+ subs r14, r14, #2 ;decrement the ht by 4
+ add r12, r12, r9 ;increment the input pointer
+ ; 2*src_strd-wd
+ add r1, r1, r8 ;increment the output pointer
+ ; 2*dst_strd-wd
+ bgt outer_loop_4
+
+end_func
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm
new file mode 100644
index 0000000000..d310a83dad
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm
@@ -0,0 +1,486 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r6 => dst_stride
+; r12 => filter_y0
+; r5 => ht
+; r3 => wd
+
+ EXPORT |vpx_convolve8_avg_vert_filter_type1_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_vert_filter_type1_neon| PROC
+
+    ; Vertical 8-tap convolution followed by an average with the existing
+    ; destination pixels: dst = ROUND(avg(dst, clip8(convolve8(src)))).
+    ; "filter_type1" selects one sign pattern of the coefficient taps; the
+    ; taps themselves are loaded as absolute values (vabs) and their signs
+    ; are realized by choosing vmlal vs vmlsl against a 0xc000 bias seeded
+    ; into each accumulator (the #0x4000 vhadd later removes the bias).
+    ; NOTE(review): the ";mul_res = vmull_u8/vmlal_u8/vmlsl_u8(...)" comments
+    ; below are inherited from the reference intrinsic implementation and do
+    ; not always match the vmlal/vmlsl opcode on the same line -- the opcode
+    ; is authoritative, not the comment.
+    stmfd           sp!, {r4 - r12, r14} ;stack stores the values of
+                                    ; the arguments
+    vpush           {d8 - d15} ; stack offset by 64
+    mov             r4, r1
+    mov             r1, r2
+    mov             r2, r4
+    vmov.i16        q15, #0x4000
+    mov             r11, #0xc000
+    ldr             r12, [sp, #104] ;load filter
+    ldr             r6, [sp, #116] ;load y0_q4
+    add             r12, r12, r6, lsl #4 ;r12 = filter[y0_q4]
+    mov             r6, r3
+    ldr             r5, [sp, #124] ;load wd
+    vld2.8          {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff)
+    sub             r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff
+    vabs.s8         d0, d0 ;vabs_s8(coeff)
+    add             r0, r0, r12 ;r0->pu1_src r12->pi1_coeff
+    ldr             r3, [sp, #128] ;load ht
+    subs            r7, r3, #0 ;r3->ht
+    vdup.u8         d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                    ; 0);
+    cmp             r5, #8
+    vdup.u8         d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                    ; 1);
+    vdup.u8         d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                    ; 2);
+    vdup.u8         d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                    ; 3);
+    vdup.u8         d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                    ; 4);
+    vdup.u8         d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                    ; 5);
+    vdup.u8         d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                    ; 6);
+    vdup.u8         d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                    ; 7);
+    blt             core_loop_wd_4 ;core loop wd 4 jump
+    str             r0, [sp, #-4]!
+    str             r1, [sp, #-4]!
+    bic             r4, r5, #7 ;r5 ->wd
+    rsb             r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd
+    rsb             r8, r4, r2, lsl #2 ;r2->src_strd
+    mov             r3, r5, lsr #3 ;divide by 8
+    mul             r7, r3 ;multiply height by width
+    sub             r7, #4 ;subtract by 4 for epilog
+
+    ; wd >= 8 path: software-pipelined, 8 pixels wide, 4 output rows in
+    ; flight per iteration (accumulators q4..q7).
+prolog
+    and             r10, r0, #31
+    add             r3, r0, r2 ;pu1_src_tmp += src_strd;
+    vdup.16         q4, r11
+    vld1.u8         {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vld1.u8         {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    subs            r4, r4, #8
+    vld1.u8         {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+                                    ; coeffabs_1);
+    vld1.u8         {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_0);
+    vld1.u8         {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_2);
+    vld1.u8         {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_3);
+    vld1.u8         {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_4);
+    vld1.u8         {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp2, coeffabs_5);
+    vld1.u8         {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_6);
+    vld1.u8         {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_7);
+    vdup.16         q5, r11
+    vld1.u8         {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+                                    ; coeffabs_1);
+    addle           r0, r0, r8
+    vmlsl.u8        q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_0);
+    bicle           r4, r5, #7 ;r5 ->wd
+    vmlal.u8        q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_2);
+    pld             [r3]
+    vmlal.u8        q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4, q4, q15
+    vdup.16         q6, r11
+    pld             [r3, r2]
+    pld             [r3, r2, lsl #1]
+    vmlal.u8        q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_4);
+    add             r3, r3, r2
+    vmlal.u8        q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp3, coeffabs_5);
+    pld             [r3, r2, lsl #1]
+    vmlsl.u8        q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_6);
+    add             r3, r0, r2 ;pu1_src_tmp += src_strd;
+    vmlsl.u8        q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20}, [r1]
+    vqrshrun.s16    d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6, d3, d23
+    vld1.u8         {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6, d2, d22
+    vrhadd.u8       d8, d8, d20
+    vld1.u8         {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6, d4, d24
+    vhadd.s16       q5, q5, q15
+    vdup.16         q7, r11
+    vmlal.u8        q6, d5, d25
+    vmlal.u8        q6, d6, d26
+    add             r14, r1, r6
+    vmlal.u8        q6, d7, d27
+    vmlsl.u8        q6, d16, d28
+    vst1.8          {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+    vmlsl.u8        q6, d17, d29
+    vld1.u8         {d20}, [r14]
+    vqrshrun.s16    d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+    addle           r1, r1, r9
+    vmlsl.u8        q7, d4, d23
+    subs            r7, r7, #4
+    vmlsl.u8        q7, d3, d22
+    vmlal.u8        q7, d5, d24
+    vld1.u8         {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7, d6, d25
+    vrhadd.u8       d10, d10, d20
+    vhadd.s16       q6, q6, q15
+    vdup.16         q4, r11
+    vmlal.u8        q7, d7, d26
+    vld1.u8         {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7, d16, d27
+    vld1.u8         {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7, d17, d28
+    vld1.u8         {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7, d18, d29
+    vld1.u8         {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vst1.8          {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12, q6, #6
+    blt             epilog_end ;jumps to epilog_end
+
+    beq             epilog ;jumps to epilog
+
+    ; Steady state: each pass retires 4 averaged output rows while the loads
+    ; and MACs for the next 4 rows are already in flight.
+main_loop_8
+    subs            r4, r4, #8
+    vmlsl.u8        q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+                                    ; coeffabs_1);
+    vld1.u8         {d20}, [r14]
+    vmlsl.u8        q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_0);
+    addle           r0, r0, r8
+    bicle           r4, r5, #7 ;r5 ->wd
+    vmlal.u8        q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_2);
+    vld1.u8         {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_3);
+    vrhadd.u8       d12, d12, d20
+    vhadd.s16       q7, q7, q15
+    vdup.16         q5, r11
+    vld1.u8         {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_4);
+    vld1.u8         {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp2, coeffabs_5);
+    vmlsl.u8        q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_6);
+    vmlsl.u8        q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_7);
+    vst1.8          {d12}, [r14], r6
+    vld1.u8         {d20}, [r14]
+    vqrshrun.s16    d14, q7, #6
+    add             r3, r0, r2 ;pu1_src_tmp += src_strd;
+    vmlsl.u8        q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+                                    ; coeffabs_1);
+    vld1.u8         {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_0);
+    vrhadd.u8       d14, d14, d20
+    vmlal.u8        q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_2);
+    vld1.u8         {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4, q4, q15
+    vdup.16         q6, r11
+    vst1.8          {d14}, [r14], r6
+    vmlal.u8        q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_4);
+    add             r14, r1, #0
+    vmlal.u8        q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp3, coeffabs_5);
+    add             r1, r1, #8
+    vmlsl.u8        q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_6);
+    addle           r1, r1, r9
+    vmlsl.u8        q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20}, [r14]
+    vqrshrun.s16    d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+    vmlsl.u8        q6, d3, d23
+    add             r10, r3, r2, lsl #3 ; 10*strd - 8+2
+    vmlsl.u8        q6, d2, d22
+    vrhadd.u8       d8, d8, d20
+    add             r10, r10, r2 ; 11*strd
+    vmlal.u8        q6, d4, d24
+    vld1.u8         {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6, d5, d25
+    vhadd.s16       q5, q5, q15
+    vdup.16         q7, r11
+    vmlal.u8        q6, d6, d26
+    vst1.8          {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res);
+    pld             [r10] ;11+ 0
+    vmlal.u8        q6, d7, d27
+    pld             [r10, r2] ;11+ 1*strd
+    pld             [r10, r2, lsl #1] ;11+ 2*strd
+    vmlsl.u8        q6, d16, d28
+    add             r10, r10, r2 ;12*strd
+    vmlsl.u8        q6, d17, d29
+    vld1.u8         {d20}, [r14]
+    vqrshrun.s16    d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+
+    pld             [r10, r2, lsl #1] ;11+ 3*strd
+    vmlsl.u8        q7, d4, d23
+    vmlsl.u8        q7, d3, d22
+    vrhadd.u8       d10, d10, d20
+    subs            r7, r7, #4
+    vmlal.u8        q7, d5, d24
+    vmlal.u8        q7, d6, d25
+    vld1.u8         {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vhadd.s16       q6, q6, q15
+    vdup.16         q4, r11
+    vmlal.u8        q7, d7, d26
+    vld1.u8         {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7, d16, d27
+    vld1.u8         {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7, d17, d28
+    vld1.u8         {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7, d18, d29
+    vld1.u8         {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vqrshrun.s16    d12, q6, #6
+    vst1.8          {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+    bgt             main_loop_8 ;jumps to main_loop_8
+
+    ; Drain the pipeline: finish the 4 rows still in flight without issuing
+    ; loads for a further block.
+epilog
+    vld1.u8         {d20}, [r14]
+    vmlsl.u8        q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+                                    ; coeffabs_1);
+    vmlsl.u8        q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_0);
+    vmlal.u8        q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_2);
+    vrhadd.u8       d12, d12, d20
+    vmlal.u8        q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_3);
+    vhadd.s16       q7, q7, q15
+    vdup.16         q5, r11
+    vmlal.u8        q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_4);
+    vmlal.u8        q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp2, coeffabs_5);
+    vmlsl.u8        q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_6);
+    vst1.8          {d12}, [r14], r6
+    vmlsl.u8        q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_7);
+    vld1.u8         {d20}, [r14]
+    vqrshrun.s16    d14, q7, #6
+    vld1.u8         {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+                                    ; coeffabs_1);
+    vmlsl.u8        q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_0);
+    vrhadd.u8       d14, d14, d20
+    vmlal.u8        q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_2);
+    vmlal.u8        q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4, q4, q15
+    vdup.16         q6, r11
+    vmlal.u8        q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_4);
+    vmlal.u8        q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp3, coeffabs_5);
+    vmlsl.u8        q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_6);
+    vst1.8          {d14}, [r14], r6
+    vmlsl.u8        q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20}, [r1]
+    vqrshrun.s16    d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6, d3, d23
+    vmlsl.u8        q6, d2, d22
+    vrhadd.u8       d8, d8, d20
+    vmlal.u8        q6, d4, d24
+    vmlal.u8        q6, d5, d25
+    vhadd.s16       q5, q5, q15
+    vdup.16         q7, r11
+    vmlal.u8        q6, d6, d26
+    vmlal.u8        q6, d7, d27
+    add             r14, r1, r6
+    vmlsl.u8        q6, d16, d28
+    vst1.8          {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+    vmlsl.u8        q6, d17, d29
+    vld1.u8         {d20}, [r14]
+    vqrshrun.s16    d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7, d4, d23
+    vmlsl.u8        q7, d3, d22
+    vrhadd.u8       d10, d10, d20
+    vmlal.u8        q7, d5, d24
+    vmlal.u8        q7, d6, d25
+    vhadd.s16       q6, q6, q15
+    vmlal.u8        q7, d7, d26
+    vmlal.u8        q7, d16, d27
+    vmlsl.u8        q7, d17, d28
+    vmlsl.u8        q7, d18, d29
+    vst1.8          {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12, q6, #6
+
+    ; Store the last two averaged rows left in q6/q7.
+epilog_end
+    vld1.u8         {d20}, [r14]
+    vrhadd.u8       d12, d12, d20
+    vst1.8          {d12}, [r14], r6
+    vhadd.s16       q7, q7, q15
+    vqrshrun.s16    d14, q7, #6
+    vld1.u8         {d20}, [r14]
+    vrhadd.u8       d14, d14, d20
+    vst1.8          {d14}, [r14], r6
+
+    ; If wd was a multiple of 8 we are done (the eq-conditional pop/ldm
+    ; below loads pc and returns); otherwise fall through to handle the
+    ; remaining 4-wide column.
+end_loops
+    tst             r5, #7
+    ldr             r1, [sp], #4
+    ldr             r0, [sp], #4
+    vpopeq          {d8 - d15}
+    ldmfdeq         sp!, {r4 - r12, r15} ;reload the registers from sp
+    mov             r5, #4
+    add             r0, r0, #8
+    add             r1, r1, #8
+    mov             r7, #16
+
+    ; wd < 8 path: 4 pixels per column, two output rows per inner pass.
+core_loop_wd_4
+    rsb             r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd
+    rsb             r8, r5, r2, lsl #2 ;r2->src_strd
+    vmov.i8         d4, #0
+
+outer_loop_wd_4
+    subs            r12, r5, #0
+    ble             end_inner_loop_wd_4 ;outer loop jump
+
+    ; Builds each 8-byte d-register from two stacked 4-pixel rows
+    ; (lane 0 = current row, lane 1 = next row) so one MAC covers both.
+inner_loop_wd_4
+    add             r3, r0, r2
+    vld1.u32        {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp1, 1);
+    subs            r12, r12, #4
+    vdup.u32        d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                    ; 1);
+    vld1.u32        {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp2, 1);
+    vld1.u32        {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp1, 0);
+    vdup.16         q0, r11
+    vmlsl.u8        q0, d5, d23 ;mul_res1 =
+                                    ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+    vdup.u32        d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                    ; 1);
+    add             r0, r0, #4
+    vld1.u32        {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8        q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+    vdup.u32        d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                    ; 1);
+    vld1.u32        {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlal.u8        q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+    vdup.16         q4, r11
+    vmlsl.u8        q4, d7, d23
+    vdup.u32        d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4,
+                                    ; 1);
+    vmull.u8        q1, d7, d25 ;mul_res2 =
+                                    ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+    vld1.u32        {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp1, 1);
+    vmlsl.u8        q4, d6, d22
+    vmlal.u8        q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+    vdup.u32        d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                    ; 1);
+    vmlal.u8        q4, d4, d24
+    vld1.u32        {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp2, 1);
+    vmlal.u8        q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+    vdup.u32        d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                    ; 1);
+    vmlal.u8        q4, d5, d25
+    vld1.u32        {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8        q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+    vdup.u32        d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                    ; 1);
+    vmlal.u8        q4, d6, d26
+    vld1.u32        {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+    vdup.u32        d4, d7[1]
+    vadd.i16        q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1,
+                                    ; mul_res2);
+    vmlal.u8        q4, d7, d27
+    vld1.u32        {d4[1]},[r3], r2
+    vmlsl.u8        q4, d4, d28
+    vdup.u32        d5, d4[1]
+    vhadd.s16       q0, q0, q15
+    vqrshrun.s16    d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u32        {d5[1]},[r3]
+    add             r3, r1, r6
+    vld1.u32        {d20[0]}, [r1]
+    vld1.u32        {d20[1]}, [r3]
+    vrhadd.u8       d0, d0, d20
+    vst1.32         {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst,
+                                    ; vreinterpret_u32_u8(sto_res), 0);
+    vmlsl.u8        q4, d5, d29
+    vst1.32         {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t
+                                    ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+    vhadd.s16       q4, q4, q15
+    vqrshrun.s16    d8, q4, #6
+    mov             r4, r3
+    vld1.u32        {d20[0]}, [r4], r6
+    vld1.u32        {d20[1]}, [r4]
+    vrhadd.u8       d8, d8, d20
+    vst1.32         {d8[0]},[r3], r6
+    add             r1, r1, #4
+    vst1.32         {d8[1]},[r3]
+    bgt             inner_loop_wd_4
+
+end_inner_loop_wd_4
+    subs            r7, r7, #4
+    add             r1, r1, r9
+    add             r0, r0, r8
+    bgt             outer_loop_wd_4
+
+    vpop            {d8 - d15}
+    ldmfd           sp!, {r4 - r12, r15} ;reload the registers from sp
+
+    ENDP
+
+    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm
new file mode 100644
index 0000000000..c5695fbda8
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm
@@ -0,0 +1,487 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r6 => dst_stride
+; r12 => filter_y0
+; r5 => ht
+; r3 => wd
+
+ EXPORT |vpx_convolve8_avg_vert_filter_type2_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_vert_filter_type2_neon| PROC
+
+    ; Vertical 8-tap convolution followed by an average with the existing
+    ; destination pixels: dst = ROUND(avg(dst, clip8(convolve8(src)))).
+    ; Identical structure to the filter_type1 variant; "filter_type2"
+    ; selects the other coefficient sign pattern, realized by swapping
+    ; vmlal/vmlsl choices against the 0xc000 accumulator bias (removed by
+    ; the #0x4000 vhadd before narrowing).
+    ; NOTE(review): the ";mul_res = vmull_u8/vmlal_u8/vmlsl_u8(...)" comments
+    ; below are inherited from the reference intrinsic implementation and do
+    ; not always match the vmlal/vmlsl opcode on the same line -- the opcode
+    ; is authoritative, not the comment.
+    stmfd           sp!, {r4 - r12, r14} ;stack stores the values of
+                                    ; the arguments
+    vpush           {d8 - d15} ; stack offset by 64
+    mov             r4, r1
+    mov             r1, r2
+    mov             r2, r4
+    vmov.i16        q15, #0x4000
+    mov             r11, #0xc000
+    ldr             r12, [sp, #104] ;load filter
+    ldr             r6, [sp, #116] ;load y0_q4
+    add             r12, r12, r6, lsl #4 ;r12 = filter[y0_q4]
+    mov             r6, r3
+    ldr             r5, [sp, #124] ;load wd
+    vld2.8          {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff)
+    sub             r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff
+    vabs.s8         d0, d0 ;vabs_s8(coeff)
+    add             r0, r0, r12 ;r0->pu1_src r12->pi1_coeff
+    ldr             r3, [sp, #128] ;load ht
+    subs            r7, r3, #0 ;r3->ht
+    vdup.u8         d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                    ; 0);
+    cmp             r5, #8
+    vdup.u8         d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                    ; 1);
+    vdup.u8         d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                    ; 2);
+    vdup.u8         d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                    ; 3);
+    vdup.u8         d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                    ; 4);
+    vdup.u8         d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                    ; 5);
+    vdup.u8         d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                    ; 6);
+    vdup.u8         d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                    ; 7);
+    blt             core_loop_wd_4 ;core loop wd 4 jump
+
+    str             r0, [sp, #-4]!
+    str             r1, [sp, #-4]!
+    bic             r4, r5, #7 ;r5 ->wd
+    rsb             r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd
+    rsb             r8, r4, r2, lsl #2 ;r2->src_strd
+    mov             r3, r5, lsr #3 ;divide by 8
+    mul             r7, r3 ;multiply height by width
+    sub             r7, #4 ;subtract by 4 for epilog
+
+    ; wd >= 8 path: software-pipelined, 8 pixels wide, 4 output rows in
+    ; flight per iteration (accumulators q4..q7).
+prolog
+    and             r10, r0, #31
+    add             r3, r0, r2 ;pu1_src_tmp += src_strd;
+    vdup.16         q4, r11
+    vld1.u8         {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vld1.u8         {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    subs            r4, r4, #8
+    vld1.u8         {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+                                    ; coeffabs_1);
+    vld1.u8         {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_0);
+    vld1.u8         {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_2);
+    vld1.u8         {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_3);
+    vld1.u8         {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_4);
+    vld1.u8         {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp2, coeffabs_5);
+    vld1.u8         {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_6);
+    vld1.u8         {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_7);
+    vdup.16         q5, r11
+    vld1.u8         {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+                                    ; coeffabs_1);
+    addle           r0, r0, r8
+    vmlsl.u8        q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_0);
+    bicle           r4, r5, #7 ;r5 ->wd
+    vmlsl.u8        q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_2);
+    pld             [r3]
+    vmlal.u8        q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4, q4, q15
+    vdup.16         q6, r11
+    pld             [r3, r2]
+    pld             [r3, r2, lsl #1]
+    vmlal.u8        q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_4);
+    add             r3, r3, r2
+    vmlsl.u8        q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp3, coeffabs_5);
+    pld             [r3, r2, lsl #1]
+    vmlal.u8        q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_6);
+    add             r3, r0, r2 ;pu1_src_tmp += src_strd;
+    vmlsl.u8        q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20}, [r1]
+    vqrshrun.s16    d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6, d3, d23
+    vld1.u8         {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6, d2, d22
+    vrhadd.u8       d8, d8, d20
+    vld1.u8         {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6, d4, d24
+    vhadd.s16       q5, q5, q15
+    vdup.16         q7, r11
+    vmlal.u8        q6, d5, d25
+    vmlal.u8        q6, d6, d26
+    add             r14, r1, r6
+    vmlsl.u8        q6, d7, d27
+    vmlal.u8        q6, d16, d28
+    vst1.8          {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+    vmlsl.u8        q6, d17, d29
+    vld1.u8         {d20}, [r14]
+    vqrshrun.s16    d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+    addle           r1, r1, r9
+    vmlal.u8        q7, d4, d23
+    subs            r7, r7, #4
+    vmlsl.u8        q7, d3, d22
+    vmlsl.u8        q7, d5, d24
+    vld1.u8         {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7, d6, d25
+    vrhadd.u8       d10, d10, d20
+    vhadd.s16       q6, q6, q15
+    vdup.16         q4, r11
+    vmlal.u8        q7, d7, d26
+    vld1.u8         {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7, d16, d27
+    vld1.u8         {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7, d17, d28
+    vld1.u8         {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7, d18, d29
+    vld1.u8         {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vst1.8          {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12, q6, #6
+    blt             epilog_end ;jumps to epilog_end
+
+    beq             epilog ;jumps to epilog
+
+    ; Steady state: each pass retires 4 averaged output rows while the loads
+    ; and MACs for the next 4 rows are already in flight.
+main_loop_8
+    subs            r4, r4, #8
+    vmlal.u8        q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+                                    ; coeffabs_1);
+    vld1.u8         {d20}, [r14]
+    vmlsl.u8        q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_0);
+    addle           r0, r0, r8
+    bicle           r4, r5, #7 ;r5 ->wd
+    vmlsl.u8        q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_2);
+    vld1.u8         {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_3);
+    vrhadd.u8       d12, d12, d20
+    vhadd.s16       q7, q7, q15
+    vdup.16         q5, r11
+    vld1.u8         {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_4);
+    vld1.u8         {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp2, coeffabs_5);
+    vmlal.u8        q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_6);
+    vmlsl.u8        q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_7);
+    vst1.8          {d12}, [r14], r6
+    vld1.u8         {d20}, [r14]
+    vqrshrun.s16    d14, q7, #6
+    add             r3, r0, r2 ;pu1_src_tmp += src_strd;
+    vmlal.u8        q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+                                    ; coeffabs_1);
+    vld1.u8         {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_0);
+    vrhadd.u8       d14, d14, d20
+    vmlsl.u8        q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_2);
+    vld1.u8         {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4, q4, q15
+    vdup.16         q6, r11
+    vst1.8          {d14}, [r14], r6
+    vmlal.u8        q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_4);
+    add             r14, r1, #0
+    vmlsl.u8        q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp3, coeffabs_5);
+    add             r1, r1, #8
+    vmlal.u8        q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_6);
+    addle           r1, r1, r9
+    vmlsl.u8        q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20}, [r14]
+    vqrshrun.s16    d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+    vmlal.u8        q6, d3, d23
+    add             r10, r3, r2, lsl #3 ; 10*strd - 8+2
+    vmlsl.u8        q6, d2, d22
+    vrhadd.u8       d8, d8, d20
+    add             r10, r10, r2 ; 11*strd
+    vmlsl.u8        q6, d4, d24
+    vld1.u8         {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6, d5, d25
+    vhadd.s16       q5, q5, q15
+    vdup.16         q7, r11
+    vmlal.u8        q6, d6, d26
+    vst1.8          {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res);
+    pld             [r10] ;11+ 0
+    vmlsl.u8        q6, d7, d27
+    pld             [r10, r2] ;11+ 1*strd
+    pld             [r10, r2, lsl #1] ;11+ 2*strd
+    vmlal.u8        q6, d16, d28
+    add             r10, r10, r2 ;12*strd
+    vmlsl.u8        q6, d17, d29
+    vld1.u8         {d20}, [r14]
+    vqrshrun.s16    d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+    pld             [r10, r2, lsl #1] ;11+ 3*strd
+    vmlal.u8        q7, d4, d23
+    vmlsl.u8        q7, d3, d22
+    vrhadd.u8       d10, d10, d20
+    subs            r7, r7, #4
+    vmlsl.u8        q7, d5, d24
+    vmlal.u8        q7, d6, d25
+    vld1.u8         {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vhadd.s16       q6, q6, q15
+    vdup.16         q4, r11
+    vmlal.u8        q7, d7, d26
+    vld1.u8         {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7, d16, d27
+    vld1.u8         {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7, d17, d28
+    vld1.u8         {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7, d18, d29
+    vld1.u8         {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vqrshrun.s16    d12, q6, #6
+    vst1.8          {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+    bgt             main_loop_8 ;jumps to main_loop_8
+
+    ; Drain the pipeline: finish the 4 rows still in flight without issuing
+    ; loads for a further block.
+epilog
+    vld1.u8         {d20}, [r14]
+    vmlal.u8        q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
+                                    ; coeffabs_1);
+    vmlsl.u8        q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_0);
+    vmlsl.u8        q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_2);
+    vrhadd.u8       d12, d12, d20
+    vmlal.u8        q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_3);
+    vhadd.s16       q7, q7, q15
+    vdup.16         q5, r11
+    vmlal.u8        q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_4);
+    vmlsl.u8        q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp2, coeffabs_5);
+    vmlal.u8        q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_6);
+    vst1.8          {d12}, [r14], r6
+    vmlsl.u8        q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_7);
+    vld1.u8         {d20}, [r14]
+    vqrshrun.s16    d14, q7, #6
+    vld1.u8         {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
+                                    ; coeffabs_1);
+    vmlsl.u8        q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_0);
+    vrhadd.u8       d14, d14, d20
+    vmlsl.u8        q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_2);
+    vmlal.u8        q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4, q4, q15
+    vdup.16         q6, r11
+    vmlal.u8        q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_4);
+    vmlsl.u8        q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp3, coeffabs_5);
+    vmlal.u8        q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_6);
+    vst1.8          {d14}, [r14], r6
+    vmlsl.u8        q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20}, [r1]
+    vqrshrun.s16    d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6, d3, d23
+    vmlsl.u8        q6, d2, d22
+    vrhadd.u8       d8, d8, d20
+    vmlsl.u8        q6, d4, d24
+    vmlal.u8        q6, d5, d25
+    vhadd.s16       q5, q5, q15
+    vdup.16         q7, r11
+    vmlal.u8        q6, d6, d26
+    vmlsl.u8        q6, d7, d27
+    add             r14, r1, r6
+    vmlal.u8        q6, d16, d28
+    vst1.8          {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+    vmlsl.u8        q6, d17, d29
+    vld1.u8         {d20}, [r14]
+    vqrshrun.s16    d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7, d4, d23
+    vmlsl.u8        q7, d3, d22
+    vrhadd.u8       d10, d10, d20
+    vmlsl.u8        q7, d5, d24
+    vmlal.u8        q7, d6, d25
+    vhadd.s16       q6, q6, q15
+    vmlal.u8        q7, d7, d26
+    vmlsl.u8        q7, d16, d27
+    vmlal.u8        q7, d17, d28
+    vmlsl.u8        q7, d18, d29
+    vst1.8          {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12, q6, #6
+
+    ; Store the last two averaged rows left in q6/q7.
+epilog_end
+    vld1.u8         {d20}, [r14]
+    vrhadd.u8       d12, d12, d20
+    vst1.8          {d12}, [r14], r6
+    vhadd.s16       q7, q7, q15
+    vqrshrun.s16    d14, q7, #6
+    vld1.u8         {d20}, [r14]
+    vrhadd.u8       d14, d14, d20
+    vst1.8          {d14}, [r14], r6
+
+    ; If wd was a multiple of 8 we are done (the eq-conditional pop/ldm
+    ; below loads pc and returns); otherwise fall through to handle the
+    ; remaining 4-wide column.
+end_loops
+    tst             r5, #7
+    ldr             r1, [sp], #4
+    ldr             r0, [sp], #4
+    vpopeq          {d8 - d15}
+    ldmfdeq         sp!, {r4 - r12, r15} ;reload the registers from sp
+
+    mov             r5, #4
+    add             r0, r0, #8
+    add             r1, r1, #8
+    mov             r7, #16
+
+    ; wd < 8 path: 4 pixels per column, two output rows per inner pass.
+core_loop_wd_4
+    rsb             r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd
+    rsb             r8, r5, r2, lsl #2 ;r2->src_strd
+    vmov.i8         d4, #0
+
+outer_loop_wd_4
+    subs            r12, r5, #0
+    ble             end_inner_loop_wd_4 ;outer loop jump
+
+    ; Builds each 8-byte d-register from two stacked 4-pixel rows
+    ; (lane 0 = current row, lane 1 = next row) so one MAC covers both.
+inner_loop_wd_4
+    add             r3, r0, r2
+    vld1.u32        {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp1, 1);
+    subs            r12, r12, #4
+    vdup.u32        d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                    ; 1);
+    vld1.u32        {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp2, 1);
+    vld1.u32        {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp1, 0);
+    vdup.16         q0, r11
+    vmlal.u8        q0, d5, d23 ;mul_res1 =
+                                    ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+    vdup.u32        d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                    ; 1);
+    add             r0, r0, #4
+    vld1.u32        {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8        q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+    vdup.u32        d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                    ; 1);
+    vld1.u32        {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+    vdup.16         q4, r11
+    vmlal.u8        q4, d7, d23
+    vdup.u32        d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4,
+                                    ; 1);
+    vmull.u8        q1, d7, d25 ;mul_res2 =
+                                    ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+    vld1.u32        {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp1, 1);
+    vmlsl.u8        q4, d6, d22
+    vmlal.u8        q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+    vdup.u32        d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                    ; 1);
+    vmlsl.u8        q4, d4, d24
+    vld1.u32        {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp2, 1);
+    vmlsl.u8        q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+    vdup.u32        d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                    ; 1);
+    vmlal.u8        q4, d5, d25
+    vld1.u32        {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlal.u8        q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+    vdup.u32        d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                    ; 1);
+    vmlal.u8        q4, d6, d26
+    vld1.u32        {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+    vdup.u32        d4, d7[1]
+    vadd.i16        q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1,
+                                    ; mul_res2);
+    vmlsl.u8        q4, d7, d27
+    vld1.u32        {d4[1]},[r3], r2
+    vmlal.u8        q4, d4, d28
+    vdup.u32        d5, d4[1]
+    vhadd.s16       q0, q0, q15
+    vqrshrun.s16    d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u32        {d5[1]},[r3]
+    add             r3, r1, r6
+    vld1.u32        {d20[0]}, [r1]
+    vld1.u32        {d20[1]}, [r3]
+    vrhadd.u8       d0, d0, d20
+    vst1.32         {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst,
+                                    ; vreinterpret_u32_u8(sto_res), 0);
+    vmlsl.u8        q4, d5, d29
+    vst1.32         {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t
+                                    ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+    vhadd.s16       q4, q4, q15
+    vqrshrun.s16    d8, q4, #6
+    mov             r4, r3
+    vld1.u32        {d20[0]}, [r4], r6
+    vld1.u32        {d20[1]}, [r4]
+    vrhadd.u8       d8, d8, d20
+    vst1.32         {d8[0]},[r3], r6
+    add             r1, r1, #4
+    vst1.32         {d8[1]},[r3]
+    bgt             inner_loop_wd_4
+
+end_inner_loop_wd_4
+    subs            r7, r7, #4
+    add             r1, r1, r9
+    add             r0, r0, r8
+    bgt             outer_loop_wd_4
+
+    vpop            {d8 - d15}
+    ldmfd           sp!, {r4 - r12, r15} ;reload the registers from sp
+
+    ENDP
+
+    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm
new file mode 100644
index 0000000000..fa1b732466
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm
@@ -0,0 +1,415 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r3 => dst_stride
+; r4 => filter_x0
+; r8 => ht
+; r10 => wd
+
+ EXPORT |vpx_convolve8_horiz_filter_type1_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_horiz_filter_type1_neon| PROC
+
+; NOTE(review): 8-tap horizontal convolution, "type1" coefficient layout.
+; The per-tap sign pattern applied below (vmlsl = subtract, vmlal = add) is
+; -,-,+,+,+,+,-,- for taps 0..7. Accumulators are seeded with 0xc000 and
+; later vhadd-averaged with 0x4000 before the >>6 saturating narrow -
+; presumably an offset/rounding scheme matching the C reference; confirm.
+; Three width paths are selected below: wd>=16, wd==8 (plus residuals for
+; 24/12), and wd<=4. Each inner loop produces two output rows per pass.
+    stmfd           sp!, {r4 - r12, r14}    ;stack stores the values of
+                                            ; the arguments
+    vpush           {d8 - d15}              ; stack offset by 64
+    mov             r4, r1
+    mov             r1, r2
+    mov             r2, r4
+start_loop_count
+    ldr             r4, [sp, #104]          ;loads pi1_coeff
+    ldr             r8, [sp, #108]          ;loads x0_q4
+    add             r4, r4, r8, lsl #4      ;r4 = filter[x0_q4]
+    ldr             r8, [sp, #128]          ;loads ht
+    ldr             r10, [sp, #124]         ;loads wd
+    vld2.8          {d0, d1}, [r4]          ;coeff = vld1_s8(pi1_coeff)
+    mov             r11, #1
+    subs            r14, r8, #0             ;checks for ht == 0
+    vabs.s8         d2, d0                  ;vabs_s8(coeff)
+    vdup.8          d24, d2[0]              ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0)
+    sub             r12, r0, #3             ;pu1_src - 3
+    vdup.8          d25, d2[1]              ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1)
+    add             r4, r12, r2             ;pu1_src_tmp2_8 = pu1_src + src_strd
+    vdup.8          d26, d2[2]              ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2)
+    rsb             r9, r10, r2, lsl #1     ;2*src_strd - wd
+    vdup.8          d27, d2[3]              ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3)
+    rsb             r8, r10, r3, lsl #1     ;2*dst_strd - wd
+    vdup.8          d28, d2[4]              ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4)
+    vdup.8          d29, d2[5]              ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5)
+    vdup.8          d30, d2[6]              ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6)
+    vdup.8          d31, d2[7]              ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7)
+    mov             r7, r1
+; Width dispatch: <=4, 24 (16+8 residual), >=16, 12 (8+4 residual), else 8.
+    cmp             r10, #4
+    ble             outer_loop_4
+
+    cmp             r10, #24
+    moveq           r10, #16
+    addeq           r8, #8
+    addeq           r9, #8
+    cmp             r10, #16
+    bge             outer_loop_16
+
+    cmp             r10, #12
+    addeq           r8, #4
+    addeq           r9, #4
+    b               outer_loop_8
+
+outer_loop8_residual
+; Entered after the 16-wide path when wd was 24: finish the rightmost 8.
+    sub             r12, r0, #3             ;pu1_src - 3
+    mov             r1, r7
+    mov             r14, #32
+    add             r1, #16
+    add             r12, #16
+    mov             r10, #8
+    add             r8, #8
+    add             r9, #8
+
+outer_loop_8
+
+    add             r6, r1, r3              ;pu1_dst + dst_strd
+    add             r4, r12, r2             ;pu1_src + src_strd
+    subs            r5, r10, #0             ;checks wd
+    ble             end_inner_loop_8
+
+inner_loop_8
+; Row 0 accumulates into q4, row 1 into q5; both seeded with 0xc000.
+    mov             r7, #0xc000
+    vld1.u32        {d0}, [r12], r11        ;vector load pu1_src
+    vdup.16         q4, r7
+    vld1.u32        {d1}, [r12], r11
+    vdup.16         q5, r7
+    vld1.u32        {d2}, [r12], r11
+    vld1.u32        {d3}, [r12], r11
+    mov             r7, #0x4000
+    vld1.u32        {d4}, [r12], r11
+    vmlsl.u8        q4, d1, d25             ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {d5}, [r12], r11
+    vmlal.u8        q4, d3, d27             ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d6}, [r12], r11
+    vmlsl.u8        q4, d0, d24             ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {d7}, [r12], r11
+    vmlal.u8        q4, d2, d26             ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {d12}, [r4], r11        ;vector load pu1_src + src_strd
+    vmlal.u8        q4, d4, d28             ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d13}, [r4], r11
+    vmlal.u8        q4, d5, d29             ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u32        {d14}, [r4], r11
+    vmlsl.u8        q4, d6, d30             ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vld1.u32        {d15}, [r4], r11
+    vmlsl.u8        q4, d7, d31             ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u32        {d16}, [r4], r11        ;vector load pu1_src + src_strd
+    vdup.16         q11, r7
+    vmlal.u8        q5, d15, d27            ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d17}, [r4], r11
+    vmlal.u8        q5, d14, d26            ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vhadd.s16       q4, q4, q11
+    vld1.u32        {d18}, [r4], r11
+    vmlal.u8        q5, d16, d28            ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d19}, [r4], r11        ;vector load pu1_src + src_strd
+    vmlal.u8        q5, d17, d29            ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vmlsl.u8        q5, d18, d30            ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5, d19, d31            ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vqrshrun.s16    d20, q4, #6             ;right shift and saturating narrow
+                                            ; result 1
+    vmlsl.u8        q5, d12, d24            ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlsl.u8        q5, d13, d25            ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vst1.8          {d20}, [r1]!            ;store the result pu1_dst
+    vhadd.s16       q5, q5, q11
+    subs            r5, r5, #8              ;decrement the wd loop
+    vqrshrun.s16    d8, q5, #6              ;right shift and saturating narrow
+                                            ; result 2
+    vst1.8          {d8}, [r6]!             ;store the result pu1_dst
+    cmp             r5, #4
+    bgt             inner_loop_8
+
+end_inner_loop_8
+    subs            r14, r14, #2            ;decrement the ht loop
+    add             r12, r12, r9            ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    add             r1, r1, r8              ;increment the dst pointer by
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_8
+
+    ldr             r10, [sp, #120]         ;loads wd
+    cmp             r10, #12
+    beq             outer_loop4_residual
+
+end_loops
+    b               end_func
+
+outer_loop_16
+; 16-wide path: software-pipelined; the prologue below primes q4 before the
+; steady-state inner loop. r0 and r7 are spilled to the stack here and
+; restored in epilog_16.
+    str             r0, [sp, #-4]!
+    str             r7, [sp, #-4]!
+    add             r6, r1, r3              ;pu1_dst + dst_strd
+    add             r4, r12, r2             ;pu1_src + src_strd
+    and             r0, r12, #31
+    mov             r7, #0xc000
+    sub             r5, r10, #0             ;checks wd
+    pld             [r4, r2, lsl #1]
+    pld             [r12, r2, lsl #1]
+    vld1.u32        {q0}, [r12], r11        ;vector load pu1_src
+    vdup.16         q4, r7
+    vld1.u32        {q1}, [r12], r11
+    vld1.u32        {q2}, [r12], r11
+    vld1.u32        {q3}, [r12], r11
+    vmlsl.u8        q4, d0, d24             ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6}, [r12], r11
+    vmlsl.u8        q4, d2, d25             ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q7}, [r12], r11
+    vmlal.u8        q4, d4, d26             ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q8}, [r12], r11
+    vmlal.u8        q4, d6, d27             ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9}, [r12], r11
+    vmlal.u8        q4, d12, d28            ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlal.u8        q4, d14, d29            ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vdup.16         q10, r7
+    vmlsl.u8        q4, d16, d30            ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4, d18, d31            ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+
+inner_loop_16
+    vmlsl.u8        q10, d1, d24
+    vdup.16         q5, r7
+    vmlsl.u8        q10, d3, d25
+    mov             r7, #0x4000
+    vdup.16         q11, r7
+    vmlal.u8        q10, d5, d26
+    vld1.u32        {q0}, [r4], r11         ;vector load pu1_src
+    vhadd.s16       q4, q4, q11
+    vld1.u32        {q1}, [r4], r11
+    vmlal.u8        q10, d7, d27
+    add             r12, #8
+    subs            r5, r5, #16
+    vmlal.u8        q10, d13, d28
+    vld1.u32        {q2}, [r4], r11
+    vmlal.u8        q10, d15, d29
+    vld1.u32        {q3}, [r4], r11
+    vqrshrun.s16    d8, q4, #6              ;right shift and saturating narrow
+                                            ; result 1
+    vmlsl.u8        q10, d17, d30
+    vld1.u32        {q6}, [r4], r11
+    vmlsl.u8        q10, d19, d31
+    vld1.u32        {q7}, [r4], r11
+    vmlsl.u8        q5, d0, d24             ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlsl.u8        q5, d2, d25             ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8}, [r4], r11
+    vhadd.s16       q10, q10, q11
+    vld1.u32        {q9}, [r4], r11
+    vmlal.u8        q5, d4, d26             ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vmlal.u8        q5, d6, d27             ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    add             r4, #8
+    mov             r7, #0xc000
+    vmlal.u8        q5, d12, d28            ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlal.u8        q5, d14, d29            ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vqrshrun.s16    d9, q10, #6
+    vdup.16         q11, r7
+    vmlsl.u8        q5, d16, d30            ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5, d18, d31            ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    mov             r7, #0x4000
+    vmlsl.u8        q11, d1, d24
+    vst1.8          {q4}, [r1]!             ;store the result pu1_dst
+    vmlsl.u8        q11, d3, d25
+    vdup.16         q10, r7
+    vmlal.u8        q11, d5, d26
+    pld             [r12, r2, lsl #2]
+    pld             [r4, r2, lsl #2]
+; The addeq/subeq group below fires at the end of a row pair (r5 reached 0
+; on the preceding subs), advancing src/dst by 2*strd-wd and ht by 2.
+    addeq           r12, r12, r9            ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    addeq           r4, r12, r2             ;pu1_src + src_strd
+    vmlal.u8        q11, d7, d27
+    addeq           r1, r1, r8
+    subeq           r14, r14, #2
+    vmlal.u8        q11, d13, d28
+    vhadd.s16       q5, q5, q10
+    vmlal.u8        q11, d15, d29
+    vmlsl.u8        q11, d17, d30
+    cmp             r14, #0
+    vmlsl.u8        q11, d19, d31
+    vqrshrun.s16    d10, q5, #6             ;right shift and saturating narrow
+                                            ; result 2
+    beq             epilog_16
+
+    vld1.u32        {q0}, [r12], r11        ;vector load pu1_src
+    mov             r7, #0xc000
+    cmp             r5, #0
+    vld1.u32        {q1}, [r12], r11
+    vhadd.s16       q11, q11, q10
+    vld1.u32        {q2}, [r12], r11
+    vdup.16         q4, r7
+    vld1.u32        {q3}, [r12], r11
+    vmlsl.u8        q4, d0, d24             ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6}, [r12], r11
+    vld1.u32        {q7}, [r12], r11
+    vmlsl.u8        q4, d2, d25             ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8}, [r12], r11
+    vmlal.u8        q4, d4, d26             ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q9}, [r12], r11
+    vqrshrun.s16    d11, q11, #6
+    vmlal.u8        q4, d6, d27             ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    moveq           r5, r10
+    vmlal.u8        q4, d12, d28            ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vdup.16         q10, r7
+    vmlal.u8        q4, d14, d29            ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vst1.8          {q5}, [r6]!             ;store the result pu1_dst
+    vmlsl.u8        q4, d16, d30            ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4, d18, d31            ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    addeq           r6, r1, r3              ;pu1_dst + dst_strd
+    b               inner_loop_16
+
+epilog_16
+; Drain the pipeline: finish the final q11 accumulator, restore spilled
+; r0/r7, then fall through to the 8-wide residual when wd was 24.
+    mov             r7, #0x4000
+    ldr             r0, [sp], #4
+    ldr             r10, [sp, #120]
+    vdup.16         q10, r7
+    vhadd.s16       q11, q11, q10
+    vqrshrun.s16    d11, q11, #6
+    vst1.8          {q5}, [r6]!             ;store the result pu1_dst
+    ldr             r7, [sp], #4
+    cmp             r10, #24
+    beq             outer_loop8_residual
+
+end_loops1
+    b               end_func
+
+outer_loop4_residual
+; Entered after the 8-wide path when wd was 12: finish the rightmost 4.
+    sub             r12, r0, #3             ;pu1_src - 3
+    mov             r1, r7
+    add             r1, #8
+    mov             r10, #4
+    add             r12, #8
+    mov             r14, #16
+    add             r8, #4
+    add             r9, #4
+
+outer_loop_4
+    add             r6, r1, r3              ;pu1_dst + dst_strd
+    add             r4, r12, r2             ;pu1_src + src_strd
+    subs            r5, r10, #0             ;checks wd
+    ble             end_inner_loop_4
+
+inner_loop_4
+; 4-wide path: rows i and i+1 are zipped into the low/high halves of each
+; d register so a single q4 accumulator covers both rows.
+    vld1.u32        {d0}, [r12], r11        ;vector load pu1_src
+    vld1.u32        {d1}, [r12], r11
+    vld1.u32        {d2}, [r12], r11
+    vld1.u32        {d3}, [r12], r11
+    vld1.u32        {d4}, [r12], r11
+    vld1.u32        {d5}, [r12], r11
+    vld1.u32        {d6}, [r12], r11
+    vld1.u32        {d7}, [r12], r11
+    sub             r12, r12, #4
+    vld1.u32        {d12}, [r4], r11        ;vector load pu1_src + src_strd
+    vld1.u32        {d13}, [r4], r11
+    vzip.32         d0, d12                 ;vector zip the i iteration and ii
+                                            ; interation in single register
+    vld1.u32        {d14}, [r4], r11
+    vzip.32         d1, d13
+    vld1.u32        {d15}, [r4], r11
+    vzip.32         d2, d14
+    vld1.u32        {d16}, [r4], r11
+    vzip.32         d3, d15
+    vld1.u32        {d17}, [r4], r11
+    vzip.32         d4, d16
+    vld1.u32        {d18}, [r4], r11
+    vzip.32         d5, d17
+    vld1.u32        {d19}, [r4], r11
+    mov             r7, #0xc000
+    vdup.16         q4, r7
+    sub             r4, r4, #4
+    vzip.32         d6, d18
+    vzip.32         d7, d19
+    vmlsl.u8        q4, d1, d25             ;arithmetic operations for ii
+                                            ; iteration in the same time
+    vmlsl.u8        q4, d0, d24
+    vmlal.u8        q4, d2, d26
+    vmlal.u8        q4, d3, d27
+    vmlal.u8        q4, d4, d28
+    vmlal.u8        q4, d5, d29
+    vmlsl.u8        q4, d6, d30
+    vmlsl.u8        q4, d7, d31
+    mov             r7, #0x4000
+    vdup.16         q10, r7
+    vhadd.s16       q4, q4, q10
+    vqrshrun.s16    d8, q4, #6
+    vst1.32         {d8[0]},[r1]!           ;store the i iteration result which
+                                            ; is in upper part of the register
+    vst1.32         {d8[1]},[r6]!           ;store the ii iteration result which
+                                            ; is in lower part of the register
+    subs            r5, r5, #4              ;decrement the wd by 4
+    bgt             inner_loop_4
+
+end_inner_loop_4
+    subs            r14, r14, #2            ;decrement the ht by 4
+    add             r12, r12, r9            ;increment the input pointer
+                                            ; 2*src_strd-wd
+    add             r1, r1, r8              ;increment the output pointer
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_4
+
+end_func
+    vpop            {d8 - d15}
+    ldmfd           sp!, {r4 - r12, r15}    ;reload the registers from sp
+
+    ENDP
+
+    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm
new file mode 100644
index 0000000000..90b2c8fef7
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm
@@ -0,0 +1,415 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r3 => dst_stride
+; r4 => filter_x0
+; r8 => ht
+; r10 => wd
+
+ EXPORT |vpx_convolve8_horiz_filter_type2_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_horiz_filter_type2_neon| PROC
+
+; NOTE(review): 8-tap horizontal convolution, "type2" coefficient layout.
+; Structurally identical to the type1 variant; the only difference is the
+; per-tap sign pattern (vmlsl = subtract, vmlal = add), here
+; -,+,-,+,+,-,+,- for taps 0..7. Accumulators are seeded with 0xc000 and
+; later vhadd-averaged with 0x4000 before the >>6 saturating narrow -
+; presumably an offset/rounding scheme matching the C reference; confirm.
+; Width paths: wd>=16, wd==8 (plus 24/12 residuals), and wd<=4.
+    stmfd           sp!, {r4 - r12, r14}    ;stack stores the values of
+                                            ; the arguments
+    vpush           {d8 - d15}              ; stack offset by 64
+    mov             r4, r1
+    mov             r1, r2
+    mov             r2, r4
+
+start_loop_count
+    ldr             r4, [sp, #104]          ;loads pi1_coeff
+    ldr             r8, [sp, #108]          ;loads x0_q4
+    add             r4, r4, r8, lsl #4      ;r4 = filter[x0_q4]
+    ldr             r8, [sp, #128]          ;loads ht
+    ldr             r10, [sp, #124]         ;loads wd
+    vld2.8          {d0, d1}, [r4]          ;coeff = vld1_s8(pi1_coeff)
+    mov             r11, #1
+    subs            r14, r8, #0             ;checks for ht == 0
+    vabs.s8         d2, d0                  ;vabs_s8(coeff)
+    vdup.8          d24, d2[0]              ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0)
+    sub             r12, r0, #3             ;pu1_src - 3
+    vdup.8          d25, d2[1]              ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1)
+    add             r4, r12, r2             ;pu1_src_tmp2_8 = pu1_src + src_strd
+    vdup.8          d26, d2[2]              ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2)
+    rsb             r9, r10, r2, lsl #1     ;2*src_strd - wd
+    vdup.8          d27, d2[3]              ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3)
+    rsb             r8, r10, r3, lsl #1     ;2*dst_strd - wd
+    vdup.8          d28, d2[4]              ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4)
+    vdup.8          d29, d2[5]              ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5)
+    vdup.8          d30, d2[6]              ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6)
+    vdup.8          d31, d2[7]              ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7)
+    mov             r7, r1
+; Width dispatch: <=4, 24 (16+8 residual), >=16, 12 (8+4 residual), else 8.
+    cmp             r10, #4
+    ble             outer_loop_4
+
+    cmp             r10, #24
+    moveq           r10, #16
+    addeq           r8, #8
+    addeq           r9, #8
+    cmp             r10, #16
+    bge             outer_loop_16
+
+    cmp             r10, #12
+    addeq           r8, #4
+    addeq           r9, #4
+    b               outer_loop_8
+
+outer_loop8_residual
+; Entered after the 16-wide path when wd was 24: finish the rightmost 8.
+    sub             r12, r0, #3             ;pu1_src - 3
+    mov             r1, r7
+    mov             r14, #32
+    add             r1, #16
+    add             r12, #16
+    mov             r10, #8
+    add             r8, #8
+    add             r9, #8
+
+outer_loop_8
+    add             r6, r1, r3              ;pu1_dst + dst_strd
+    add             r4, r12, r2             ;pu1_src + src_strd
+    subs            r5, r10, #0             ;checks wd
+    ble             end_inner_loop_8
+
+inner_loop_8
+; Row 0 accumulates into q4, row 1 into q5; both seeded with 0xc000.
+    mov             r7, #0xc000
+    vld1.u32        {d0}, [r12], r11        ;vector load pu1_src
+    vdup.16         q4, r7
+    vld1.u32        {d1}, [r12], r11
+    vdup.16         q5, r7
+    vld1.u32        {d2}, [r12], r11
+    vld1.u32        {d3}, [r12], r11
+    mov             r7, #0x4000
+    vld1.u32        {d4}, [r12], r11
+    vmlal.u8        q4, d1, d25             ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {d5}, [r12], r11
+    vmlal.u8        q4, d3, d27             ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d6}, [r12], r11
+    vmlsl.u8        q4, d0, d24             ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {d7}, [r12], r11
+    vmlsl.u8        q4, d2, d26             ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {d12}, [r4], r11        ;vector load pu1_src + src_strd
+    vmlal.u8        q4, d4, d28             ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d13}, [r4], r11
+    vmlsl.u8        q4, d5, d29             ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u32        {d14}, [r4], r11
+    vmlal.u8        q4, d6, d30             ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vld1.u32        {d15}, [r4], r11
+    vmlsl.u8        q4, d7, d31             ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u32        {d16}, [r4], r11        ;vector load pu1_src + src_strd
+    vdup.16         q11, r7
+    vmlal.u8        q5, d15, d27            ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d17}, [r4], r11
+    vmlsl.u8        q5, d14, d26            ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vhadd.s16       q4, q4, q11
+    vld1.u32        {d18}, [r4], r11
+    vmlal.u8        q5, d16, d28            ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d19}, [r4], r11        ;vector load pu1_src + src_strd
+    vmlsl.u8        q5, d17, d29            ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vmlal.u8        q5, d18, d30            ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5, d19, d31            ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vqrshrun.s16    d20, q4, #6             ;right shift and saturating narrow
+                                            ; result 1
+    vmlsl.u8        q5, d12, d24            ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlal.u8        q5, d13, d25            ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vst1.8          {d20}, [r1]!            ;store the result pu1_dst
+    vhadd.s16       q5, q5, q11
+    subs            r5, r5, #8              ;decrement the wd loop
+    vqrshrun.s16    d8, q5, #6              ;right shift and saturating narrow
+                                            ; result 2
+    vst1.8          {d8}, [r6]!             ;store the result pu1_dst
+    cmp             r5, #4
+    bgt             inner_loop_8
+
+end_inner_loop_8
+    subs            r14, r14, #2            ;decrement the ht loop
+    add             r12, r12, r9            ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    add             r1, r1, r8              ;increment the dst pointer by
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_8
+
+    ldr             r10, [sp, #120]         ;loads wd
+    cmp             r10, #12
+    beq             outer_loop4_residual
+
+end_loops
+    b               end_func
+
+outer_loop_16
+; 16-wide path: software-pipelined; the prologue primes q4 before the
+; steady-state inner loop. r0 and r7 are spilled here, restored in epilog_16.
+    str             r0, [sp, #-4]!
+    str             r7, [sp, #-4]!
+    add             r6, r1, r3              ;pu1_dst + dst_strd
+    add             r4, r12, r2             ;pu1_src + src_strd
+    and             r0, r12, #31
+    mov             r7, #0xc000
+    sub             r5, r10, #0             ;checks wd
+    pld             [r4, r2, lsl #1]
+    pld             [r12, r2, lsl #1]
+    vld1.u32        {q0}, [r12], r11        ;vector load pu1_src
+    vdup.16         q4, r7
+    vld1.u32        {q1}, [r12], r11
+    vld1.u32        {q2}, [r12], r11
+    vld1.u32        {q3}, [r12], r11
+    vmlsl.u8        q4, d0, d24             ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6}, [r12], r11
+    vmlal.u8        q4, d2, d25             ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q7}, [r12], r11
+    vmlsl.u8        q4, d4, d26             ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q8}, [r12], r11
+    vmlal.u8        q4, d6, d27             ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9}, [r12], r11
+    vmlal.u8        q4, d12, d28            ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlsl.u8        q4, d14, d29            ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vdup.16         q10, r7
+    vmlal.u8        q4, d16, d30            ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4, d18, d31            ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+
+inner_loop_16
+    vmlsl.u8        q10, d1, d24
+    vdup.16         q5, r7
+    vmlal.u8        q10, d3, d25
+    mov             r7, #0x4000
+    vdup.16         q11, r7
+    vmlsl.u8        q10, d5, d26
+    vld1.u32        {q0}, [r4], r11         ;vector load pu1_src
+    vhadd.s16       q4, q4, q11
+    vld1.u32        {q1}, [r4], r11
+    vmlal.u8        q10, d7, d27
+    add             r12, #8
+    subs            r5, r5, #16
+    vmlal.u8        q10, d13, d28
+    vld1.u32        {q2}, [r4], r11
+    vmlsl.u8        q10, d15, d29
+    vld1.u32        {q3}, [r4], r11
+    vqrshrun.s16    d8, q4, #6              ;right shift and saturating narrow
+                                            ; result 1
+    vmlal.u8        q10, d17, d30
+    vld1.u32        {q6}, [r4], r11
+    vmlsl.u8        q10, d19, d31
+    vld1.u32        {q7}, [r4], r11
+    vmlsl.u8        q5, d0, d24             ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlal.u8        q5, d2, d25             ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8}, [r4], r11
+    vhadd.s16       q10, q10, q11
+    vld1.u32        {q9}, [r4], r11
+    vmlsl.u8        q5, d4, d26             ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vmlal.u8        q5, d6, d27             ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    add             r4, #8
+    mov             r7, #0xc000
+    vmlal.u8        q5, d12, d28            ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlsl.u8        q5, d14, d29            ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vqrshrun.s16    d9, q10, #6
+    vdup.16         q11, r7
+    vmlal.u8        q5, d16, d30            ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5, d18, d31            ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    mov             r7, #0x4000
+    vmlsl.u8        q11, d1, d24
+    vst1.8          {q4}, [r1]!             ;store the result pu1_dst
+    vmlal.u8        q11, d3, d25
+    vdup.16         q10, r7
+    vmlsl.u8        q11, d5, d26
+    pld             [r12, r2, lsl #2]
+    pld             [r4, r2, lsl #2]
+; The addeq/subeq group below fires at the end of a row pair (r5 reached 0
+; on the preceding subs), advancing src/dst by 2*strd-wd and ht by 2.
+    addeq           r12, r12, r9            ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    addeq           r4, r12, r2             ;pu1_src + src_strd
+    vmlal.u8        q11, d7, d27
+    addeq           r1, r1, r8
+    subeq           r14, r14, #2
+    vmlal.u8        q11, d13, d28
+    vhadd.s16       q5, q5, q10
+    vmlsl.u8        q11, d15, d29
+    vmlal.u8        q11, d17, d30
+    cmp             r14, #0
+    vmlsl.u8        q11, d19, d31
+    vqrshrun.s16    d10, q5, #6             ;right shift and saturating narrow
+                                            ; result 2
+    beq             epilog_16
+
+    vld1.u32        {q0}, [r12], r11        ;vector load pu1_src
+    mov             r7, #0xc000
+    cmp             r5, #0
+    vld1.u32        {q1}, [r12], r11
+    vhadd.s16       q11, q11, q10
+    vld1.u32        {q2}, [r12], r11
+    vdup.16         q4, r7
+    vld1.u32        {q3}, [r12], r11
+    vmlsl.u8        q4, d0, d24             ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6}, [r12], r11
+    vld1.u32        {q7}, [r12], r11
+    vmlal.u8        q4, d2, d25             ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8}, [r12], r11
+    vmlsl.u8        q4, d4, d26             ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q9}, [r12], r11
+    vqrshrun.s16    d11, q11, #6
+    vmlal.u8        q4, d6, d27             ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    moveq           r5, r10
+    vmlal.u8        q4, d12, d28            ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vdup.16         q10, r7
+    vmlsl.u8        q4, d14, d29            ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vst1.8          {q5}, [r6]!             ;store the result pu1_dst
+    vmlal.u8        q4, d16, d30            ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4, d18, d31            ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    addeq           r6, r1, r3              ;pu1_dst + dst_strd
+    b               inner_loop_16
+
+epilog_16
+; Drain the pipeline: finish the final q11 accumulator, restore spilled
+; r0/r7, then fall through to the 8-wide residual when wd was 24.
+    mov             r7, #0x4000
+    ldr             r0, [sp], #4
+    ldr             r10, [sp, #120]
+    vdup.16         q10, r7
+    vhadd.s16       q11, q11, q10
+    vqrshrun.s16    d11, q11, #6
+    vst1.8          {q5}, [r6]!             ;store the result pu1_dst
+    ldr             r7, [sp], #4
+    cmp             r10, #24
+    beq             outer_loop8_residual
+
+end_loops1
+    b               end_func
+
+outer_loop4_residual
+; Entered after the 8-wide path when wd was 12: finish the rightmost 4.
+    sub             r12, r0, #3             ;pu1_src - 3
+    mov             r1, r7
+    add             r1, #8
+    mov             r10, #4
+    add             r12, #8
+    mov             r14, #16
+    add             r8, #4
+    add             r9, #4
+
+outer_loop_4
+    add             r6, r1, r3              ;pu1_dst + dst_strd
+    add             r4, r12, r2             ;pu1_src + src_strd
+    subs            r5, r10, #0             ;checks wd
+    ble             end_inner_loop_4
+
+inner_loop_4
+; 4-wide path: rows i and i+1 are zipped into the low/high halves of each
+; d register so a single q4 accumulator covers both rows.
+    vld1.u32        {d0}, [r12], r11        ;vector load pu1_src
+    vld1.u32        {d1}, [r12], r11
+    vld1.u32        {d2}, [r12], r11
+    vld1.u32        {d3}, [r12], r11
+    vld1.u32        {d4}, [r12], r11
+    vld1.u32        {d5}, [r12], r11
+    vld1.u32        {d6}, [r12], r11
+    vld1.u32        {d7}, [r12], r11
+    sub             r12, r12, #4
+    vld1.u32        {d12}, [r4], r11        ;vector load pu1_src + src_strd
+    vld1.u32        {d13}, [r4], r11
+    vzip.32         d0, d12                 ;vector zip the i iteration and ii
+                                            ; interation in single register
+    vld1.u32        {d14}, [r4], r11
+    vzip.32         d1, d13
+    vld1.u32        {d15}, [r4], r11
+    vzip.32         d2, d14
+    vld1.u32        {d16}, [r4], r11
+    vzip.32         d3, d15
+    vld1.u32        {d17}, [r4], r11
+    vzip.32         d4, d16
+    vld1.u32        {d18}, [r4], r11
+    vzip.32         d5, d17
+    vld1.u32        {d19}, [r4], r11
+    mov             r7, #0xc000
+    vdup.16         q4, r7
+    sub             r4, r4, #4
+    vzip.32         d6, d18
+    vzip.32         d7, d19
+    vmlal.u8        q4, d1, d25             ;arithmetic operations for ii
+                                            ; iteration in the same time
+    vmlsl.u8        q4, d0, d24
+    vmlsl.u8        q4, d2, d26
+    vmlal.u8        q4, d3, d27
+    vmlal.u8        q4, d4, d28
+    vmlsl.u8        q4, d5, d29
+    vmlal.u8        q4, d6, d30
+    vmlsl.u8        q4, d7, d31
+    mov             r7, #0x4000
+    vdup.16         q10, r7
+    vhadd.s16       q4, q4, q10
+    vqrshrun.s16    d8, q4, #6
+    vst1.32         {d8[0]},[r1]!           ;store the i iteration result which
+                                            ; is in upper part of the register
+    vst1.32         {d8[1]},[r6]!           ;store the ii iteration result which
+                                            ; is in lower part of the register
+    subs            r5, r5, #4              ;decrement the wd by 4
+    bgt             inner_loop_4
+
+end_inner_loop_4
+    subs            r14, r14, #2            ;decrement the ht by 4
+    add             r12, r12, r9            ;increment the input pointer
+                                            ; 2*src_strd-wd
+    add             r1, r1, r8              ;increment the output pointer
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_4
+
+end_func
+    vpop            {d8 - d15}
+    ldmfd           sp!, {r4 - r12, r15}    ;reload the registers from sp
+
+    ENDP
+
+    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
new file mode 100644
index 0000000000..b312cc747c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -0,0 +1,2110 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_ports/mem.h"
+
+// Note:
+// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src).
+// 2. After refactoring the shared code in kernel loops with inline functions,
+// the decoder speed dropped a lot when using gcc compiler. Therefore there is
+// no refactoring for those parts by now.
+// 3. For horizontal convolve, there is an alternative optimization that
+// convolves a single row in each loop. For each row, 8 sample banks with 4 or 8
+// samples in each are read from memory: src, (src+1), (src+2), (src+3),
+// (src+4), (src+5), (src+6), (src+7), or prepared by vector extract
+// instructions. This optimization is much faster in speed unit test, but slowed
+// down the whole decoder by 5%.
+
+#if VPX_ARCH_AARCH64 && \
+ (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
+
+// Byte-permute table for the dot-product horizontal convolve paths. Each
+// 4-byte group selects an overlapping 4-sample window (offsets k..k+3) from
+// a loaded source vector, as required by the convolve8_*_usdot helpers.
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+  4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+// vqtbl2 index table that interleaves four 8-byte rows into column-major
+// order; used by transpose_concat_4x4/_8x4 for the vertical convolve.
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+  0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+// vqtbl2 index tables that slide newly loaded rows into the previous
+// iteration's transposed block (vertical convolve inner loop); indices >=16
+// select from the second input vector of the table lookup.
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+  /* Shift left and insert new last column in transposed 4x4 block. */
+  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+  /* Shift left and insert two new columns in transposed 4x4 block. */
+  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+  /* Shift left and insert three new columns in transposed 4x4 block. */
+  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
+                              int h) {
+  // Horizontal 8-tap convolution using the USDOT dot-product path. The
+  // 16-bit taps for phase x0_q4 are narrowed to 8 bits up front.
+  const int8x8_t filter_taps = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  uint8x16_t r0, r1, r2, r3;
+
+  assert(!((intptr_t)dst & 3));
+  assert(!(dst_stride & 3));
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  // Step back so the leading filter taps are covered.
+  src -= 3;
+
+  if (w == 4) {
+    const uint8x16x2_t perm = vld1q_u8_x2(dot_prod_permute_tbl);
+    // Four rows of four output pixels per iteration.
+    do {
+      int32x4_t acc0, acc1, acc2, acc3;
+      int16x8_t sum01, sum23;
+      uint8x8_t out01, out23;
+
+      load_u8_16x4(src, src_stride, &r0, &r1, &r2, &r3);
+
+      acc0 = convolve8_4_usdot(r0, filter_taps, perm);
+      acc1 = convolve8_4_usdot(r1, filter_taps, perm);
+      acc2 = convolve8_4_usdot(r2, filter_taps, perm);
+      acc3 = convolve8_4_usdot(r3, filter_taps, perm);
+      sum01 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
+      sum23 = vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
+      out01 = vqrshrun_n_s16(sum01, 7);
+      out23 = vqrshrun_n_s16(sum23, 7);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, out01);
+      store_u8(dst + 2 * dst_stride, dst_stride, out23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x3_t perm = vld1q_u8_x3(dot_prod_permute_tbl);
+    // Tile the frame in 8-wide columns, four rows at a time.
+    do {
+      const uint8_t *src_ptr = src;
+      uint8_t *dst_ptr = dst;
+      int remaining = w;
+      do {
+        uint8x8_t out0, out1, out2, out3;
+
+        load_u8_16x4(src_ptr, src_stride, &r0, &r1, &r2, &r3);
+
+        out0 = convolve8_8_usdot(r0, filter_taps, perm);
+        out1 = convolve8_8_usdot(r1, filter_taps, perm);
+        out2 = convolve8_8_usdot(r2, filter_taps, perm);
+        out3 = convolve8_8_usdot(r3, filter_taps, perm);
+
+        store_u8_8x4(dst_ptr, dst_stride, out0, out1, out2, out3);
+
+        src_ptr += 8;
+        dst_ptr += 8;
+        remaining -= 8;
+      } while (remaining > 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  }
+}
+
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const InterpKernel *filter, int x0_q4,
+                                  int x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h) {
+  // Horizontal 8-tap convolution whose result is rounding-averaged (vrhadd)
+  // with the existing dst pixels before being stored. Mirrors
+  // vpx_convolve8_horiz_neon plus the load/average step.
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  uint8x16_t s0, s1, s2, s3;
+
+  assert(!((intptr_t)dst & 3));
+  assert(!(dst_stride & 3));
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  // Step back so the leading filter taps are covered.
+  src -= 3;
+
+  if (w == 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    do {
+      int32x4_t t0, t1, t2, t3;
+      int16x8_t t01, t23;
+      // Fix: dd01/dd23 were zero-initialised with vdup_n_u8(0) and then
+      // unconditionally overwritten by load_u8() below before any use; the
+      // dead stores have been removed.
+      uint8x8_t d01, d23, dd01, dd23;
+
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve8_4_usdot(s0, filters, permute_tbl);
+      t1 = convolve8_4_usdot(s1, filters, permute_tbl);
+      t2 = convolve8_4_usdot(s2, filters, permute_tbl);
+      t3 = convolve8_4_usdot(s3, filters, permute_tbl);
+      t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+      t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+      d01 = vqrshrun_n_s16(t01, 7);
+      d23 = vqrshrun_n_s16(t23, 7);
+
+      // Average with the existing destination pixels.
+      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+
+    // Tile the frame in 8-wide columns, four rows at a time.
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve8_8_usdot(s0, filters, permute_tbl);
+        d1 = convolve8_8_usdot(s1, filters, permute_tbl);
+        d2 = convolve8_8_usdot(s2, filters, permute_tbl);
+        d3 = convolve8_8_usdot(s3, filters, permute_tbl);
+
+        // Average with the existing destination pixels.
+        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  }
+}
+
+static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
+                                        uint8x8_t a2, uint8x8_t a3,
+                                        uint8x16_t *b,
+                                        const uint8x16_t permute_tbl) {
+  /* Transpose the low four bytes of each input row and concatenate into a
+   * single vector:
+   * a0: 00, 01, 02, 03, XX, XX, XX, XX
+   * a1: 10, 11, 12, 13, XX, XX, XX, XX
+   * a2: 20, 21, 22, 23, XX, XX, XX, XX
+   * a3: 30, 31, 32, 33, XX, XX, XX, XX
+   *
+   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   *
+   * 'permute_tbl' is always 'dot_prod_tran_concat_tbl'; callers pass it in
+   * so this frequently-used helper does not reload it from memory.
+   */
+  uint8x16x2_t rows;
+  rows.val[0] = vcombine_u8(a0, a1);
+  rows.val[1] = vcombine_u8(a2, a3);
+  *b = vqtbl2q_u8(rows, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
+                                        uint8x8_t a2, uint8x8_t a3,
+                                        uint8x16_t *b0, uint8x16_t *b1,
+                                        const uint8x16x2_t permute_tbl) {
+  /* Transpose four full 8-byte rows and concatenate into two vectors:
+   * a0: 00, 01, 02, 03, 04, 05, 06, 07
+   * a1: 10, 11, 12, 13, 14, 15, 16, 17
+   * a2: 20, 21, 22, 23, 24, 25, 26, 27
+   * a3: 30, 31, 32, 33, 34, 35, 36, 37
+   *
+   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+   *
+   * 'permute_tbl' is always 'dot_prod_tran_concat_tbl'; callers pass it in
+   * so this frequently-used helper does not reload it from memory.
+   */
+  uint8x16x2_t rows;
+  rows.val[0] = vcombine_u8(a0, a1);
+  rows.val[1] = vcombine_u8(a2, a3);
+  *b0 = vqtbl2q_u8(rows, permute_tbl.val[0]);
+  *b1 = vqtbl2q_u8(rows, permute_tbl.val[1]);
+}
+
+/* Vertical 8-tap convolution, USDOT (mixed-sign 8-bit dot-product) code
+ * path. Four input rows at a time are transposed and concatenated into
+ * per-column sample blocks so that each output row is produced by
+ * dot-product instructions (via convolve8_*_usdot_partial).
+ */
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h) {
+  /* Narrow the 16-bit filter taps to 8 bits for the dot-product kernels. */
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  uint8x16x2_t samples_LUT;
+
+  assert(!((intptr_t)dst & 3));
+  assert(!(dst_stride & 3));
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  /* The 8-tap filter needs 3 rows of context above the first output row. */
+  src -= 3 * src_stride;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int32x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    src += 7 * src_stride;
+
+    /* Rows 7-9 have not been loaded yet; zero-fill them so the transposes
+     * below are fully defined. The blocks built from them (s4567, s5678,
+     * s6789) are overwritten with real data inside the loop before use.
+     */
+    s7 = vdup_n_u8(0);
+    s8 = vdup_n_u8(0);
+    s9 = vdup_n_u8(0);
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+    transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+    transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+    transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+    do {
+      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_usdot_partial(s0123, s4567, filters);
+      d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
+      d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
+      d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
+      /* Saturating narrow with rounding right-shift by 7 (filter Q-format). */
+      d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+      d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    /* Process the image in 8-pixel-wide column strips. */
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      /* Zero-fill rows 7-9; see the note in the w == 4 path above. */
+      s7 = vdup_n_u8(0);
+      s8 = vdup_n_u8(0);
+      s9 = vdup_n_u8(0);
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+                           tran_concat_tbl);
+
+      do {
+        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                       filters);
+        d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                       filters);
+        d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                       filters);
+        d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                       filters);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height > 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w > 0);
+  }
+}
+
+/* Vertical 8-tap convolution with averaging, USDOT (mixed-sign 8-bit
+ * dot-product) code path. Identical filtering pipeline to
+ * vpx_convolve8_vert_neon above, but the filter output is rounding-averaged
+ * (vrhadd) with the pixels already present in the destination.
+ */
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h) {
+  /* Narrow the 16-bit filter taps to 8 bits for the dot-product kernels. */
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  uint8x16x2_t samples_LUT;
+
+  assert(!((intptr_t)dst & 3));
+  assert(!(dst_stride & 3));
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  /* The 8-tap filter needs 3 rows of context above the first output row. */
+  src -= 3 * src_stride;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int32x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23, dd01, dd23;
+
+    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    src += 7 * src_stride;
+
+    /* Rows 7-9 have not been loaded yet; zero-fill them so the transposes
+     * below are fully defined. The blocks built from them (s4567, s5678,
+     * s6789) are overwritten with real data inside the loop before use.
+     */
+    s7 = vdup_n_u8(0);
+    s8 = vdup_n_u8(0);
+    s9 = vdup_n_u8(0);
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+    transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+    transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+    transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+    do {
+      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_usdot_partial(s0123, s4567, filters);
+      d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
+      d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
+      d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
+      d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+      d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+      /* Rounding-average the filter output with the existing dst pixels. */
+      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    /* Process the image in 8-pixel-wide column strips. */
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      /* Zero-fill rows 7-9; see the note in the w == 4 path above. */
+      s7 = vdup_n_u8(0);
+      s8 = vdup_n_u8(0);
+      s9 = vdup_n_u8(0);
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+                           tran_concat_tbl);
+
+      do {
+        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                       filters);
+        d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                       filters);
+        d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                       filters);
+        d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                       filters);
+
+        /* Rounding-average the filter output with the existing dst pixels. */
+        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height > 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w > 0);
+  }
+}
+
+#else // !defined(__ARM_FEATURE_MATMUL_INT8)
+
+/* Horizontal 8-tap convolution, SDOT (signed 8-bit dot-product) code path.
+ * The dot-product helpers bias the unsigned source samples into the signed
+ * range by subtracting 128 (range_limit); 'correction' (128 * sum of the
+ * filter taps) is added to the accumulator to cancel that bias.
+ */
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
+                              int h) {
+  /* Narrow the 16-bit filter taps to 8 bits for the dot-product kernels. */
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  uint8x16_t s0, s1, s2, s3;
+
+  assert(!((intptr_t)dst & 3));
+  assert(!(dst_stride & 3));
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  /* The 8-tap filter needs 3 columns of context left of each output pixel. */
+  src -= 3;
+
+  if (w == 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    do {
+      int32x4_t t0, t1, t2, t3;
+      int16x8_t t01, t23;
+      uint8x8_t d01, d23;
+
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl);
+      t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl);
+      t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl);
+      t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl);
+      t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+      t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+      /* Saturating narrow with rounding right-shift by 7 (filter Q-format). */
+      d01 = vqrshrun_n_s16(t01, 7);
+      d23 = vqrshrun_n_s16(t23, 7);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3;
+
+    /* Process the image in 8-pixel-wide column strips, 4 rows at a time. */
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 =
+            convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl);
+        d1 =
+            convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl);
+        d2 =
+            convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl);
+        d3 =
+            convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  }
+}
+
+/* Horizontal 8-tap convolution with averaging, SDOT (signed 8-bit
+ * dot-product) code path. Computes the same filter output as
+ * vpx_convolve8_horiz_neon and then rounding-averages it (vrhadd) with the
+ * pixels already present in the destination.
+ */
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const InterpKernel *filter, int x0_q4,
+                                  int x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h) {
+  /* Narrow the 16-bit filter taps to 8 bits for the dot-product kernels. */
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  /* Bias compensation: samples are shifted into the signed range by
+   * subtracting range_limit (128), so 128 * sum(taps) is added back. */
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  uint8x16_t s0, s1, s2, s3;
+
+  assert(!((intptr_t)dst & 3));
+  assert(!(dst_stride & 3));
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  /* The 8-tap filter needs 3 columns of context left of each output pixel. */
+  src -= 3;
+
+  if (w == 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    do {
+      int32x4_t t0, t1, t2, t3;
+      int16x8_t t01, t23;
+      /* dd01/dd23 are unconditionally loaded from dst below, so no
+       * zero-initialization is needed. */
+      uint8x8_t d01, d23, dd01, dd23;
+
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl);
+      t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl);
+      t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl);
+      t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl);
+      t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+      t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+      d01 = vqrshrun_n_s16(t01, 7);
+      d23 = vqrshrun_n_s16(t23, 7);
+
+      /* Rounding-average the filter output with the existing dst pixels. */
+      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+
+    /* Process the image in 8-pixel-wide column strips, 4 rows at a time. */
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 =
+            convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl);
+        d1 =
+            convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl);
+        d2 =
+            convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl);
+        d3 =
+            convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl);
+
+        /* Rounding-average the filter output with the existing dst pixels. */
+        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  }
+}
+
+static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+                                        int8x8_t a3, int8x16_t *b,
+                                        const uint8x16_t permute_tbl) {
+  /* Interleave the first four bytes of each of four rows into one 16-byte
+   * column-major block:
+   * a0: 00, 01, 02, 03, XX, XX, XX, XX
+   * a1: 10, 11, 12, 13, XX, XX, XX, XX
+   * a2: 20, 21, 22, 23, XX, XX, XX, XX
+   * a3: 30, 31, 32, 33, XX, XX, XX, XX
+   *
+   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   *
+   * 'permute_tbl' must be 'dot_prod_tran_concat_tbl' above. The caller
+   * loads it once and passes it in, since this inline helper is invoked
+   * many times from the same parent function.
+   */
+  int8x16x2_t rows;
+  rows.val[0] = vcombine_s8(a0, a1);
+  rows.val[1] = vcombine_s8(a2, a3);
+  *b = vqtbl2q_s8(rows, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+                                        int8x8_t a3, int8x16_t *b0,
+                                        int8x16_t *b1,
+                                        const uint8x16x2_t permute_tbl) {
+  /* Interleave four 8-wide rows into two 16-byte column-major blocks:
+   * a0: 00, 01, 02, 03, 04, 05, 06, 07
+   * a1: 10, 11, 12, 13, 14, 15, 16, 17
+   * a2: 20, 21, 22, 23, 24, 25, 26, 27
+   * a3: 30, 31, 32, 33, 34, 35, 36, 37
+   *
+   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+   *
+   * 'permute_tbl' must be 'dot_prod_tran_concat_tbl' above. The caller
+   * loads it once and passes it in, since this inline helper is invoked
+   * many times from the same parent function.
+   */
+  int8x16x2_t rows;
+  rows.val[0] = vcombine_s8(a0, a1);
+  rows.val[1] = vcombine_s8(a2, a3);
+  *b0 = vqtbl2q_s8(rows, permute_tbl.val[0]);
+  *b1 = vqtbl2q_s8(rows, permute_tbl.val[1]);
+}
+
+/* Vertical 8-tap convolution, SDOT (signed 8-bit dot-product) code path.
+ * Source rows are biased into the signed range (subtract 128) before the
+ * transpose/concat step; 'correction' (128 * sum of the filter taps) is
+ * added back inside the dot-product helpers to cancel the bias.
+ */
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h) {
+  /* Narrow the 16-bit filter taps to 8 bits for the dot-product kernels. */
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x8_t range_limit = vdup_n_u8(128);
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  int8x16x2_t samples_LUT;
+
+  assert(!((intptr_t)dst & 3));
+  assert(!(dst_stride & 3));
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  /* The 8-tap filter needs 3 rows of context above the first output row. */
+  src -= 3 * src_stride;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int32x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+    src += 7 * src_stride;
+
+    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+    /* Rows 7-9 have not been loaded yet; zero-fill them so the transposes
+     * below are fully defined. The blocks built from them (s4567, s5678,
+     * s6789) are overwritten with real data inside the loop before use.
+     */
+    s7 = vdup_n_s8(0);
+    s8 = vdup_n_s8(0);
+    s9 = vdup_n_s8(0);
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+    transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+    transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+    transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+    do {
+      uint8x8_t t7, t8, t9, t10;
+
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
+      d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
+      d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
+      d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
+      /* Saturating narrow with rounding right-shift by 7 (filter Q-format). */
+      d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+      d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    /* Process the image in 8-pixel-wide column strips. */
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+      s += 7 * src_stride;
+
+      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+      /* Zero-fill rows 7-9; see the note in the w == 4 path above. */
+      s7 = vdup_n_s8(0);
+      s8 = vdup_n_s8(0);
+      s9 = vdup_n_s8(0);
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+                           tran_concat_tbl);
+
+      do {
+        uint8x8_t t7, t8, t9, t10;
+
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                      correction, filters);
+        d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                      correction, filters);
+        d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                      correction, filters);
+        d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                      correction, filters);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height > 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w > 0);
+  }
+}
+
+/* Vertical 8-tap convolution with averaging, SDOT (signed 8-bit
+ * dot-product) code path. Identical filtering pipeline to
+ * vpx_convolve8_vert_neon above, but the filter output is rounding-averaged
+ * (vrhadd) with the pixels already present in the destination.
+ */
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h) {
+  /* Narrow the 16-bit filter taps to 8 bits for the dot-product kernels. */
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+  /* Bias compensation: samples are shifted into the signed range by
+   * subtracting range_limit (128), so 128 * sum(taps) is added back. */
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x8_t range_limit = vdup_n_u8(128);
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  int8x16x2_t samples_LUT;
+
+  assert(!((intptr_t)dst & 3));
+  assert(!(dst_stride & 3));
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  /* The 8-tap filter needs 3 rows of context above the first output row. */
+  src -= 3 * src_stride;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int32x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23, dd01, dd23;
+
+    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+    src += 7 * src_stride;
+
+    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+    /* Rows 7-9 have not been loaded yet; zero-fill them so the transposes
+     * below are fully defined. The blocks built from them (s4567, s5678,
+     * s6789) are overwritten with real data inside the loop before use.
+     */
+    s7 = vdup_n_s8(0);
+    s8 = vdup_n_s8(0);
+    s9 = vdup_n_s8(0);
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+    transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+    transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+    transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+    do {
+      uint8x8_t t7, t8, t9, t10;
+
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
+      d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
+      d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
+      d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
+      d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+      d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+      /* Rounding-average the filter output with the existing dst pixels. */
+      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    /* Process the image in 8-pixel-wide column strips. */
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+      s += 7 * src_stride;
+
+      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+      /* Zero-fill rows 7-9; see the note in the w == 4 path above. */
+      s7 = vdup_n_s8(0);
+      s8 = vdup_n_s8(0);
+      s9 = vdup_n_s8(0);
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+                           tran_concat_tbl);
+
+      do {
+        uint8x8_t t7, t8, t9, t10;
+
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                      correction, filters);
+        d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                      correction, filters);
+        d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                      correction, filters);
+        d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                      correction, filters);
+
+        /* Rounding-average the filter output with the existing dst pixels. */
+        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height > 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w > 0);
+  }
+}
+
+#endif // defined(__ARM_FEATURE_MATMUL_INT8)
+
+#else // !(VPX_ARCH_AARCH64 &&
+ // (defined(__ARM_FEATURE_DOTPROD) ||
+ // defined(__ARM_FEATURE_MATMUL_INT8)))
+
+/* 8-tap horizontal convolution - portable NEON fallback path, compiled only
+ * when the AArch64 DotProd/I8MM specialisations above are unavailable.
+ * Strategy: load a tile of rows, transpose so each vector lane holds one
+ * output column, filter with convolve8_4/convolve8_8 (16-bit multiply-
+ * accumulate), then transpose back before storing. Tiles are 4x4 (h == 4),
+ * 4-wide x 8-tall (w == 4) or 8x8 (general case). */
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+ uint8x8_t t0, t1, t2, t3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ /* Step back 3 columns so the first output pixel sees all 8 filter taps. */
+ src -= 3;
+
+ if (h == 4) {
+ uint8x8_t d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ int16x8_t tt0, tt1, tt2, tt3;
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ /* Transpose 4 rows of 8 pixels; after widening to s16, s0..s6 hold the
+ * first 7 input columns, one column per 4-lane vector. */
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s0 = vget_low_s16(tt0);
+ s1 = vget_low_s16(tt1);
+ s2 = vget_low_s16(tt2);
+ s3 = vget_low_s16(tt3);
+ s4 = vget_high_s16(tt0);
+ s5 = vget_high_s16(tt1);
+ s6 = vget_high_s16(tt2);
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ src += 7;
+
+ do {
+ /* Each iteration produces a 4x4 output tile; 4 new columns arrive. */
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s7 = vget_low_s16(tt0);
+ s8 = vget_low_s16(tt1);
+ s9 = vget_low_s16(tt2);
+ s10 = vget_low_s16(tt3);
+
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ /* Round, shift by FILTER_BITS (7) and saturate back to u8, then
+ * transpose the 4x4 tile back to row order. */
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
+ transpose_u8_4x4(&d01, &d23);
+
+ /* transpose_u8_4x4 interleaves rows 0/2 into d01 and 1/3 into d23. */
+ vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),
+ vreinterpret_u32_u8(d01), 0);
+ vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),
+ vreinterpret_u32_u8(d23), 0);
+ vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),
+ vreinterpret_u32_u8(d01), 1);
+ vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),
+ vreinterpret_u32_u8(d23), 1);
+
+ /* Slide the column window along by 4 for the next tile. */
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4;
+ dst += 4;
+ w -= 4;
+ } while (w != 0);
+ } else {
+ int width;
+ const uint8_t *s;
+ uint8x8_t t4, t5, t6, t7;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+ if (w == 4) {
+ /* Width of 4: process 8 rows per iteration (4 output columns each). */
+ do {
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
+ src += 8 * src_stride;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+ /* Only 4 output columns are needed, hence the 4x8 transpose. */
+ transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ /* Back to row order; lane 0 holds rows 0-3, lane 1 rows 4-7. */
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1);
+ dst += dst_stride;
+ h -= 8;
+ } while (h > 0);
+ } else {
+ /* General case: 8x8 tiles, walking across the row then down. */
+ uint8_t *d;
+ int16x8_t s11, s12, s13, s14;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ width = w;
+ s = src + 7;
+ d = dst;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+ t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+ t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+ t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
+
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
+
+ /* Re-use the last 7 of the 15 loaded columns for the next tile. */
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+ }
+ }
+}
+
+/* Same tiling and transpose scheme as vpx_convolve8_horiz_neon, but the
+ * filtered result is rounding-averaged (vrhadd) with the existing dst
+ * pixels before being stored (compound/averaging prediction). */
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+ uint8x8_t t0, t1, t2, t3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ /* Step back 3 columns so the first output pixel sees all 8 filter taps. */
+ src -= 3;
+
+ if (h == 4) {
+ uint8x8_t d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ int16x8_t tt0, tt1, tt2, tt3;
+ uint32x4_t d0123 = vdupq_n_u32(0);
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s0 = vget_low_s16(tt0);
+ s1 = vget_low_s16(tt1);
+ s2 = vget_low_s16(tt2);
+ s3 = vget_low_s16(tt3);
+ s4 = vget_high_s16(tt0);
+ s5 = vget_high_s16(tt1);
+ s6 = vget_high_s16(tt2);
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ src += 7;
+
+ do {
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s7 = vget_low_s16(tt0);
+ s8 = vget_low_s16(tt1);
+ s9 = vget_low_s16(tt2);
+ s10 = vget_low_s16(tt3);
+
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
+ transpose_u8_4x4(&d01, &d23);
+
+ /* Gather the 4 existing dst rows into one q-register, using lane order
+ * 0,2,1,3 to match the row interleave produced by transpose_u8_4x4. */
+ d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
+ d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
+ d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
+ d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
+ d0123 = vreinterpretq_u32_u8(
+ vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));
+
+ /* Scatter the averaged rows back in the same lane order. */
+ vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
+ vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
+ vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
+ vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4;
+ dst += 4;
+ w -= 4;
+ } while (w != 0);
+ } else {
+ int width;
+ const uint8_t *s;
+ uint8x8_t t4, t5, t6, t7;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+ if (w == 4) {
+ /* Accumulators named after the dst rows their lanes hold (0,4,1,5 and
+ * 2,6,3,7), matching the output order of transpose_u8_8x4 below. */
+ uint32x4_t d0415 = vdupq_n_u32(0);
+ uint32x4_t d2637 = vdupq_n_u32(0);
+ do {
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
+ src += 8 * src_stride;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+ transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ /* Load existing dst rows into the matching lanes, average, store. */
+ d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0);
+ d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2);
+ d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0);
+ d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2);
+ d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1);
+ d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3);
+ d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1);
+ d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3);
+ d0415 = vreinterpretq_u32_u8(
+ vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1)));
+ d2637 = vreinterpretq_u32_u8(
+ vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3)));
+
+ vst1q_lane_u32((uint32_t *)dst, d0415, 0);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d0415, 2);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d2637, 0);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d2637, 2);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d0415, 1);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d0415, 3);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d2637, 1);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d2637, 3);
+ dst += dst_stride;
+ h -= 8;
+ } while (h > 0);
+ } else {
+ /* General case: 8x8 tiles with full-width dst averaging. */
+ uint8_t *d;
+ int16x8_t s11, s12, s13, s14;
+ uint8x16_t d01, d23, d45, d67;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ width = w;
+ s = src + 7;
+ d = dst;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+ t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+ t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+ t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
+
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ /* Average two dst rows at a time against the filtered tile. */
+ d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
+ vld1_u8(d + 1 * dst_stride));
+ d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
+ vld1_u8(d + 3 * dst_stride));
+ d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride),
+ vld1_u8(d + 5 * dst_stride));
+ d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride),
+ vld1_u8(d + 7 * dst_stride));
+ d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1));
+ d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3));
+ d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5));
+ d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7));
+
+ store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01),
+ vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45),
+ vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67));
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+ }
+ }
+}
+
+/* 8-tap vertical convolution - portable NEON fallback path. No transposes
+ * are needed here: each loaded row already lines up with one filter tap, so
+ * the kernel keeps a sliding window of 7 rows (s0..s6), loads 4 new rows per
+ * iteration and emits 4 output rows. Columns are processed 4 (w == 4) or 8
+ * at a time. */
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(y_step_q4 == 16);
+
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ /* Step back 3 rows so the first output row sees all 8 filter taps. */
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ uint8x8_t d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+
+ /* Prime the 7-row window (widened to s16 for the MLA-based filter). */
+ s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+
+ do {
+ s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ /* Round, shift by FILTER_BITS (7), saturate to u8 and store 4 rows. */
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
+ dst += dst_stride;
+
+ /* Slide the row window down by 4. */
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ /* 8-wide column strips, full height per strip. */
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+ uint8x8_t t0, t1, t2, t3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ s = src;
+ s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ d = dst;
+ height = h;
+
+ do {
+ s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ vst1_u8(d, t0);
+ d += dst_stride;
+ vst1_u8(d, t1);
+ d += dst_stride;
+ vst1_u8(d, t2);
+ d += dst_stride;
+ vst1_u8(d, t3);
+ d += dst_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+/* Same sliding-window scheme as vpx_convolve8_vert_neon, but the filtered
+ * result is rounding-averaged (vrhadd) with the existing dst pixels before
+ * being stored (compound/averaging prediction). */
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(y_step_q4 == 16);
+
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ /* Step back 3 rows so the first output row sees all 8 filter taps. */
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ uint8x8_t d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ uint32x4_t d0123 = vdupq_n_u32(0);
+
+ /* Prime the 7-row window. */
+ s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+
+ do {
+ s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
+
+ /* Gather 4 existing dst rows (no transpose here, so the lane order is
+ * the natural 0,1,2,3), average, then scatter back. */
+ d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
+ d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1);
+ d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2);
+ d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
+ d0123 = vreinterpretq_u32_u8(
+ vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));
+
+ vst1q_lane_u32((uint32_t *)dst, d0123, 0);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d0123, 1);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d0123, 2);
+ dst += dst_stride;
+ vst1q_lane_u32((uint32_t *)dst, d0123, 3);
+ dst += dst_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ /* 8-wide column strips, full height per strip. */
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+ uint8x8_t t0, t1, t2, t3;
+ uint8x16_t d01, d23, dd01, dd23;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ s = src;
+ s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ d = dst;
+ height = h;
+
+ do {
+ s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+ /* Average two filtered rows at a time with the existing dst rows. */
+ d01 = vcombine_u8(t0, t1);
+ d23 = vcombine_u8(t2, t3);
+ dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
+ vld1_u8(d + 1 * dst_stride));
+ dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
+ vld1_u8(d + 3 * dst_stride));
+ dd01 = vrhaddq_u8(dd01, d01);
+ dd23 = vrhaddq_u8(dd23, d23);
+
+ vst1_u8(d, vget_low_u8(dd01));
+ d += dst_stride;
+ vst1_u8(d, vget_high_u8(dd01));
+ d += dst_stride;
+ vst1_u8(d, vget_low_u8(dd23));
+ d += dst_stride;
+ vst1_u8(d, vget_high_u8(dd23));
+ d += dst_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+#endif // #if VPX_ARCH_AARCH64 &&
+ // (defined(__ARM_FEATURE_DOTPROD) ||
+ // defined(__ARM_FEATURE_MATMUL_INT8))
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
new file mode 100644
index 0000000000..07cf8242d3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
+#define VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
+
static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
                                                 const int8x16_t samples_hi,
                                                 const int32x4_t correction,
                                                 const int8x8_t filters) {
  /* Sample range-clamping and permutation are performed by the caller. */
  /* Accumulate both dot products into 'correction' to account for the range
   * clamp applied by the caller.  Narrowing and packing of the 32-bit sums
   * is also left to the caller. */
  const int32x4_t acc = vdotq_lane_s32(correction, samples_lo, filters, 0);
  return vdotq_lane_s32(acc, samples_hi, filters, 1);
}
+
static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples,
                                         const int8x8_t filters,
                                         const int32x4_t correction,
                                         const uint8x16_t range_limit,
                                         const uint8x16x2_t permute_tbl) {
  /* Clamp the sample range to [-128, 127] for the 8-bit signed dot
   * product. */
  const int8x16_t clamped =
      vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));

  /* Permute samples ready for the dot product:
   * lo: { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
   * hi: { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
  const int8x16_t perm_lo = vqtbl1q_s8(clamped, permute_tbl.val[0]);
  const int8x16_t perm_hi = vqtbl1q_s8(clamped, permute_tbl.val[1]);

  /* Accumulate the dot products into 'correction' to account for the range
   * clamp.  Narrowing and packing is performed by the caller. */
  int32x4_t acc = vdotq_lane_s32(correction, perm_lo, filters, 0);
  acc = vdotq_lane_s32(acc, perm_hi, filters, 1);
  return acc;
}
+
static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
                                                 const int8x16_t samples0_hi,
                                                 const int8x16_t samples1_lo,
                                                 const int8x16_t samples1_hi,
                                                 const int32x4_t correction,
                                                 const int8x8_t filters) {
  /* Sample range-clamping and permutation are performed by the caller. */
  /* Each set of 4 outputs accumulates its dot products into 'correction' to
   * account for the range clamp. */
  int32x4_t acc0 = vdotq_lane_s32(correction, samples0_lo, filters, 0);
  int32x4_t acc1 = vdotq_lane_s32(correction, samples1_lo, filters, 0);
  int16x8_t packed;

  acc0 = vdotq_lane_s32(acc0, samples0_hi, filters, 1);
  acc1 = vdotq_lane_s32(acc1, samples1_hi, filters, 1);

  /* Narrow to 16 bits, re-pack, then round/shift/saturate to u8 (the filter
   * taps are in Q7 format). */
  packed = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
  return vqrshrun_n_s16(packed, 7);
}
+
static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
                                         const int8x8_t filters,
                                         const int32x4_t correction,
                                         const uint8x16_t range_limit,
                                         const uint8x16x3_t permute_tbl) {
  int8x16_t clamped;
  int8x16_t perm0, perm1, perm2;
  int32x4_t acc0, acc1;
  int16x8_t packed;

  /* Clamp the sample range to [-128, 127] for the 8-bit signed dot
   * product. */
  clamped = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));

  /* Permute samples ready for the dot products:
   * perm0: { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
   * perm1: { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
   * perm2: { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
  perm0 = vqtbl1q_s8(clamped, permute_tbl.val[0]);
  perm1 = vqtbl1q_s8(clamped, permute_tbl.val[1]);
  perm2 = vqtbl1q_s8(clamped, permute_tbl.val[2]);

  /* Accumulate into 'correction' to account for the range clamp.  The first
   * 4 outputs use perm0/perm1, the second 4 use perm1/perm2. */
  acc0 = vdotq_lane_s32(correction, perm0, filters, 0);
  acc0 = vdotq_lane_s32(acc0, perm1, filters, 1);
  acc1 = vdotq_lane_s32(correction, perm1, filters, 0);
  acc1 = vdotq_lane_s32(acc1, perm2, filters, 1);

  /* Narrow, re-pack and round/shift/saturate to u8 (filters are Q7). */
  packed = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
  return vqrshrun_n_s16(packed, 7);
}
+
+#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
+
+#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
+
static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
                                                  const uint8x16_t samples_hi,
                                                  const int8x8_t filters) {
  /* Sample permutation is performed by the caller.  The unsigned*signed dot
   * product needs no range-clamp correction, so accumulation starts from
   * zero.  Narrowing and packing is performed by the caller. */
  const int32x4_t acc =
      vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
  return vusdotq_lane_s32(acc, samples_hi, filters, 1);
}
+
/* 8-tap convolution of 4 output values using the unsigned*signed (usdot)
 * dot-product instruction.  Unlike the sdot variants above, no 'correction'
 * term or range clamp is needed: vusdot consumes the unsigned samples
 * directly, so accumulation starts from zero. */
static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples,
                                          const int8x8_t filters,
                                          const uint8x16x2_t permute_tbl) {
  uint8x16_t permuted_samples[2];
  int32x4_t sum;

  /* Permute samples ready for dot product. */
  /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
  /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);

  /* Accumulate the dot products starting from zero (no clamp-correction term
   * is required for the unsigned*signed form). */
  sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
  sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);

  /* Narrowing and packing is performed by the caller. */
  return sum;
}
+
static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
                                                  const uint8x16_t samples0_hi,
                                                  const uint8x16_t samples1_lo,
                                                  const uint8x16_t samples1_hi,
                                                  const int8x8_t filters) {
  /* Sample permutation is performed by the caller.  The unsigned*signed dot
   * product needs no range-clamp correction, so both accumulators start from
   * zero. */
  const int32x4_t zero = vdupq_n_s32(0);
  int32x4_t acc0 = vusdotq_lane_s32(zero, samples0_lo, filters, 0);
  int32x4_t acc1 = vusdotq_lane_s32(zero, samples1_lo, filters, 0);
  int16x8_t packed;

  acc0 = vusdotq_lane_s32(acc0, samples0_hi, filters, 1);
  acc1 = vusdotq_lane_s32(acc1, samples1_hi, filters, 1);

  /* Narrow to 16 bits, re-pack, then round/shift/saturate to u8 (the filter
   * taps are in Q7 format). */
  packed = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
  return vqrshrun_n_s16(packed, 7);
}
+
static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
                                          const int8x8_t filters,
                                          const uint8x16x3_t permute_tbl) {
  const int32x4_t zero = vdupq_n_s32(0);
  uint8x16_t perm0, perm1, perm2;
  int32x4_t acc0, acc1;
  int16x8_t packed;

  /* Permute samples ready for the dot products:
   * perm0: { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
   * perm1: { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
   * perm2: { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
  perm0 = vqtbl1q_u8(samples, permute_tbl.val[0]);
  perm1 = vqtbl1q_u8(samples, permute_tbl.val[1]);
  perm2 = vqtbl1q_u8(samples, permute_tbl.val[2]);

  /* Unsigned*signed dot products start from zero (no clamp correction).
   * First 4 outputs use perm0/perm1, second 4 use perm1/perm2. */
  acc0 = vusdotq_lane_s32(zero, perm0, filters, 0);
  acc0 = vusdotq_lane_s32(acc0, perm1, filters, 1);
  acc1 = vusdotq_lane_s32(zero, perm1, filters, 0);
  acc1 = vusdotq_lane_s32(acc1, perm2, filters, 1);

  /* Narrow, re-pack and round/shift/saturate to u8 (filters are Q7). */
  packed = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
  return vqrshrun_n_s16(packed, 7);
}
+
+#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
+
/* 8-tap convolution of 4 lanes of 16-bit samples.  s0..s7 are eight
 * consecutive input samples (widened to s16 by the caller); 'filters' holds
 * the 8 filter taps.  Returns the un-rounded 16-bit sums; rounding and
 * narrowing are performed by the caller. */
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
                                    const int16x4_t s2, const int16x4_t s3,
                                    const int16x4_t s4, const int16x4_t s5,
                                    const int16x4_t s6, const int16x4_t s7,
                                    const int16x8_t filters) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int16x4_t sum;

  /* Taps 0-2 and 5-7 accumulate with ordinary (wrapping) multiply-adds. */
  sum = vmul_lane_s16(s0, filters_lo, 0);
  sum = vmla_lane_s16(sum, s1, filters_lo, 1);
  sum = vmla_lane_s16(sum, s2, filters_lo, 2);
  sum = vmla_lane_s16(sum, s5, filters_hi, 1);
  sum = vmla_lane_s16(sum, s6, filters_hi, 2);
  sum = vmla_lane_s16(sum, s7, filters_hi, 3);
  /* The centre taps (3 and 4) are folded in last with saturating adds --
   * presumably because they carry the largest coefficients and are the terms
   * that can push the 16-bit sum out of range (NOTE(review): confirm against
   * the filter tables). */
  sum = vqadd_s16(sum, vmul_lane_s16(s3, filters_lo, 3));
  sum = vqadd_s16(sum, vmul_lane_s16(s4, filters_hi, 0));
  return sum;
}
+
/* 8-tap convolution of 8 lanes of 16-bit samples.  s0..s7 are eight
 * consecutive input samples (widened to s16 by the caller); 'filters' holds
 * the 8 taps in Q7 format.  Returns the rounded, narrowed u8 result
 * (vqrshrun by 7: round, shift and saturate). */
static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
                                    const int16x8_t s2, const int16x8_t s3,
                                    const int16x8_t s4, const int16x8_t s5,
                                    const int16x8_t s6, const int16x8_t s7,
                                    const int16x8_t filters) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int16x8_t sum;

  /* Taps 0-2 and 5-7 accumulate with ordinary (wrapping) multiply-adds. */
  sum = vmulq_lane_s16(s0, filters_lo, 0);
  sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
  sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
  sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
  sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
  sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
  /* The centre taps (3 and 4) are folded in last with saturating adds --
   * presumably because they carry the largest coefficients and are the terms
   * that can push the 16-bit sum out of range (NOTE(review): confirm against
   * the filter tables). */
  sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3));
  sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0));
  return vqrshrun_n_s16(sum, 7);
}
+
static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
                                       const int16x8_t filters) {
  int16x8_t ss[8];
  int i;

  /* Widen each of the 8 input vectors from u8 to s16 before filtering. */
  for (i = 0; i < 8; ++i) {
    ss[i] = vreinterpretq_s16_u16(vmovl_u8(s[i]));
  }

  return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
                     filters);
}
+
+#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c
new file mode 100644
index 0000000000..c4177c5385
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_filter.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon_asm.h"
+
+/* Type1 and Type2 functions are called depending on the position of the
+ * negative and positive coefficients in the filter. In type1, the filter kernel
+ * used is sub_pel_filters_8lp, in which only the first two and the last two
+ * coefficients are negative. In type2, the negative coefficients are 0, 2, 5 &
+ * 7.
+ */
+
/* Thin run-time dispatch wrapper: generates vpx_convolve8_<dir>_neon(),
 * which selects between the two hand-written assembly kernels based on the
 * filter bank in use.  vp9_filter_kernels[1] maps to the type1 kernel; every
 * other kernel maps to type2 (the comment above describes how the two types
 * differ in coefficient signs). */
#define DEFINE_FILTER(dir)                                                 \
  void vpx_convolve8_##dir##_neon(                                         \
      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,              \
      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,         \
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {             \
    if (filter == vp9_filter_kernels[1]) {                                 \
      vpx_convolve8_##dir##_filter_type1_neon(                             \
          src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,      \
          y0_q4, y_step_q4, w, h);                                         \
    } else {                                                               \
      vpx_convolve8_##dir##_filter_type2_neon(                             \
          src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,      \
          y0_q4, y_step_q4, w, h);                                         \
    }                                                                      \
  }

/* Instantiate the four directional variants. */
DEFINE_FILTER(horiz)
DEFINE_FILTER(avg_horiz)
DEFINE_FILTER(vert)
DEFINE_FILTER(avg_vert)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h
new file mode 100644
index 0000000000..f1c7d62ed0
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_
+#define VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_
+
/* Prototypes for the hand-written assembly convolution kernels.  Each
 * direction (horiz/vert, with and without averaging) exists in a type1 and a
 * type2 variant; the wrappers in vpx_convolve8_neon_asm.c pick between them
 * at run time based on the filter bank. */
#define DECLARE_FILTER(dir, type)                                  \
  void vpx_convolve8_##dir##_filter_##type##_neon(                 \
      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,      \
      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h);

DECLARE_FILTER(horiz, type1)
DECLARE_FILTER(avg_horiz, type1)
DECLARE_FILTER(horiz, type2)
DECLARE_FILTER(avg_horiz, type2)
DECLARE_FILTER(vert, type1)
DECLARE_FILTER(avg_vert, type1)
DECLARE_FILTER(vert, type2)
DECLARE_FILTER(avg_vert, type2)
+
+#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm
new file mode 100644
index 0000000000..2666d4253e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm
@@ -0,0 +1,457 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r6 => dst_stride
+; r12 => filter_y0
+; r5 => ht
+; r3 => wd
+
+ EXPORT |vpx_convolve8_vert_filter_type1_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
|vpx_convolve8_vert_filter_type1_neon| PROC

    ; Vertical 8-tap convolution for the "type1" filter banks.  The prologue
    ; swaps r1 and r2 so that r1 holds dst and r2 holds src_stride, matching
    ; the register map in the file header.  Accumulators are seeded with
    ; 0xc000 (r11) and later averaged against q15 (0x4000) via vhadd before a
    ; 6-bit rounding shift -- presumably an offset/halving trick standing in
    ; for the usual 7-bit Q7 rounding shift (NOTE(review): confirm).
    ; NOTE(review): the trailing pseudocode comments were inherited from the
    ; kernel this file was derived from and frequently name vmull/vmlsl where
    ; the opcode actually used is vmlal/vmlsl with different signs -- trust
    ; the opcodes, not the comments.
    stmfd sp!, {r4 - r12, r14} ;stack stores the values of
    ; the arguments
    vpush {d8 - d15} ; stack offset by 64
    mov r4, r1
    mov r1, r2
    mov r2, r4
    vmov.i16 q15, #0x4000
    mov r11, #0xc000
    ldr r12, [sp, #104] ;load filter
    ldr r6, [sp, #116] ;load y0_q4
    add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4]
    mov r6, r3
    ldr r5, [sp, #124] ;load wd
    vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff)
    sub r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff
    vabs.s8 d0, d0 ;vabs_s8(coeff)
    add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff
    ldr r3, [sp, #128] ;load ht
    subs r7, r3, #0 ;r3->ht
    vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
    ; 0);
    cmp r5, #8
    vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
    ; 1);
    vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
    ; 2);
    vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
    ; 3);
    vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
    ; 4);
    vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
    ; 5);
    vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
    ; 6);
    vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
    ; 7);
    blt core_loop_wd_4 ;core loop wd 4 jump

    str r0, [sp, #-4]!
    str r1, [sp, #-4]!
    bic r4, r5, #7 ;r5 ->wd
    rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd
    rsb r8, r4, r2, lsl #2 ;r2->src_strd
    mov r3, r5, lsr #3 ;divide by 8
    mul r7, r3 ;multiply height by width
    sub r7, #4 ;subtract by one for epilog

prolog
    ; Software-pipelined prologue: primes the first accumulators (q4..q7)
    ; before the steady-state loop takes over.
    and r10, r0, #31
    add r3, r0, r2 ;pu1_src_tmp += src_strd;
    vdup.16 q4, r11
    vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
    vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
    subs r4, r4, #8
    vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
    vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
    ; coeffabs_1);
    vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
    vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
    ; src_tmp1, coeffabs_0);
    vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
    vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
    ; src_tmp3, coeffabs_2);
    vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
    vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
    ; src_tmp4, coeffabs_3);
    vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
    vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
    ; src_tmp1, coeffabs_4);
    vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
    vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
    ; src_tmp2, coeffabs_5);
    vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
    vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
    ; src_tmp3, coeffabs_6);
    vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
    vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
    ; src_tmp4, coeffabs_7);
    vdup.16 q5, r11
    vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
    vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
    ; coeffabs_1);
    addle r0, r0, r8
    vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
    ; src_tmp2, coeffabs_0);
    bicle r4, r5, #7 ;r5 ->wd
    vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
    ; src_tmp4, coeffabs_2);
    pld [r3]
    vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
    ; src_tmp1, coeffabs_3);
    vhadd.s16 q4, q4, q15
    vdup.16 q6, r11
    pld [r3, r2]
    vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
    ; src_tmp2, coeffabs_4);
    pld [r3, r2, lsl #1]
    vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
    ; src_tmp3, coeffabs_5);
    add r3, r3, r2
    vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
    ; src_tmp4, coeffabs_6);
    pld [r3, r2, lsl #1]
    vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
    ; src_tmp1, coeffabs_7);
    add r3, r0, r2 ;pu1_src_tmp += src_strd;
    vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
    vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
    vmlsl.u8 q6, d3, d23
    vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
    vmlsl.u8 q6, d2, d22
    vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
    vmlal.u8 q6, d4, d24
    vhadd.s16 q5, q5, q15
    vdup.16 q7, r11
    vmlal.u8 q6, d5, d25
    vmlal.u8 q6, d6, d26
    vmlal.u8 q6, d7, d27
    vmlsl.u8 q6, d16, d28
    vmlsl.u8 q6, d17, d29
    add r14, r1, r6
    vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
    vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
    addle r1, r1, r9
    vmlsl.u8 q7, d4, d23
    subs r7, r7, #4
    vmlsl.u8 q7, d3, d22
    vmlal.u8 q7, d5, d24
    vmlal.u8 q7, d6, d25
    vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
    vhadd.s16 q6, q6, q15
    vdup.16 q4, r11
    vmlal.u8 q7, d7, d26
    vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
    vmlal.u8 q7, d16, d27
    vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
    vmlsl.u8 q7, d17, d28
    vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
    vmlsl.u8 q7, d18, d29
    vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
    vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
    vqrshrun.s16 d12, q6, #6
    blt epilog_end ;jumps to epilog_end

    beq epilog ;jumps to epilog

main_loop_8
    ; Steady state: each iteration produces 4 rows of an 8-wide column,
    ; with loads, MACs, prefetches and stores interleaved for dual issue.
    subs r4, r4, #8
    vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
    ; coeffabs_1);
    addle r0, r0, r8
    vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
    ; src_tmp1, coeffabs_0);
    bicle r4, r5, #7 ;r5 ->wd
    vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
    ; src_tmp3, coeffabs_2);
    vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
    vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
    ; src_tmp4, coeffabs_3);
    vhadd.s16 q7, q7, q15
    vdup.16 q5, r11
    vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
    vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
    ; src_tmp1, coeffabs_4);
    vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
    vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
    ; src_tmp2, coeffabs_5);
    vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
    ; src_tmp3, coeffabs_6);
    vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
    ; src_tmp4, coeffabs_7);
    vst1.8 {d12}, [r14], r6
    vqrshrun.s16 d14, q7, #6
    add r3, r0, r2 ;pu1_src_tmp += src_strd;
    vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
    ; coeffabs_1);
    vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
    vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
    ; src_tmp2, coeffabs_0);
    vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
    ; src_tmp4, coeffabs_2);
    vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
    vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
    ; src_tmp1, coeffabs_3);
    vhadd.s16 q4, q4, q15
    vdup.16 q6, r11
    vst1.8 {d14}, [r14], r6
    vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
    ; src_tmp2, coeffabs_4);
    add r14, r1, #0
    vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
    ; src_tmp3, coeffabs_5);
    add r1, r1, #8
    vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
    ; src_tmp4, coeffabs_6);
    vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
    ; src_tmp1, coeffabs_7);
    addle r1, r1, r9
    vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
    vmlsl.u8 q6, d3, d23
    add r10, r3, r2, lsl #3 ; 10*strd - 8+2
    vmlsl.u8 q6, d2, d22
    add r10, r10, r2 ; 11*strd
    vmlal.u8 q6, d4, d24
    vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
    vmlal.u8 q6, d5, d25
    vhadd.s16 q5, q5, q15
    vdup.16 q7, r11
    vmlal.u8 q6, d6, d26
    vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res);
    pld [r10] ;11+ 0
    vmlal.u8 q6, d7, d27
    pld [r10, r2] ;11+ 1*strd
    vmlsl.u8 q6, d16, d28
    pld [r10, r2, lsl #1] ;11+ 2*strd
    vmlsl.u8 q6, d17, d29
    add r10, r10, r2 ;12*strd
    vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
    pld [r10, r2, lsl #1] ;11+ 3*strd
    vmlsl.u8 q7, d4, d23
    vmlsl.u8 q7, d3, d22
    subs r7, r7, #4
    vmlal.u8 q7, d5, d24
    vmlal.u8 q7, d6, d25
    vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
    vhadd.s16 q6, q6, q15
    vdup.16 q4, r11
    vmlal.u8 q7, d7, d26
    vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
    vmlal.u8 q7, d16, d27
    vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
    vmlsl.u8 q7, d17, d28
    vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
    vmlsl.u8 q7, d18, d29
    vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
    vqrshrun.s16 d12, q6, #6
    vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
    bgt main_loop_8 ;jumps to main_loop_8

epilog
    ; Pipeline drain: finishes the final 4 in-flight rows without issuing
    ; new loads for a further iteration.
    vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2,
    ; coeffabs_1);
    vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
    ; src_tmp1, coeffabs_0);
    vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
    ; src_tmp3, coeffabs_2);
    vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
    ; src_tmp4, coeffabs_3);
    vhadd.s16 q7, q7, q15
    vdup.16 q5, r11
    vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
    ; src_tmp1, coeffabs_4);
    vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
    ; src_tmp2, coeffabs_5);
    vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
    ; src_tmp3, coeffabs_6);
    vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
    ; src_tmp4, coeffabs_7);
    vst1.8 {d12}, [r14], r6
    vqrshrun.s16 d14, q7, #6
    vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
    vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3,
    ; coeffabs_1);
    vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
    ; src_tmp2, coeffabs_0);
    vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
    ; src_tmp4, coeffabs_2);
    vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
    ; src_tmp1, coeffabs_3);
    vhadd.s16 q4, q4, q15
    vdup.16 q6, r11
    vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
    ; src_tmp2, coeffabs_4);
    vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
    ; src_tmp3, coeffabs_5);
    vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
    ; src_tmp4, coeffabs_6);
    vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
    ; src_tmp1, coeffabs_7);
    vst1.8 {d14}, [r14], r6
    vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
    vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
    vmlsl.u8 q6, d3, d23
    vmlsl.u8 q6, d2, d22
    vmlal.u8 q6, d4, d24
    vmlal.u8 q6, d5, d25
    vhadd.s16 q5, q5, q15
    vdup.16 q7, r11
    vmlal.u8 q6, d6, d26
    vmlal.u8 q6, d7, d27
    vmlsl.u8 q6, d16, d28
    vmlsl.u8 q6, d17, d29
    add r14, r1, r6
    vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
    vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
    vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
    vmlsl.u8 q7, d4, d23
    vmlsl.u8 q7, d3, d22
    vmlal.u8 q7, d5, d24
    vmlal.u8 q7, d6, d25
    vhadd.s16 q6, q6, q15
    vmlal.u8 q7, d7, d26
    vmlal.u8 q7, d16, d27
    vmlsl.u8 q7, d17, d28
    vmlsl.u8 q7, d18, d29
    vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
    vqrshrun.s16 d12, q6, #6

epilog_end
    ; Store the last two result rows of the 8-wide path.
    vst1.8 {d12}, [r14], r6
    vhadd.s16 q7, q7, q15
    vqrshrun.s16 d14, q7, #6
    vst1.8 {d14}, [r14], r6

end_loops
    ; If wd was a multiple of 8 we are done (conditional epilogue/return);
    ; otherwise fall through to process the remaining 4-wide column.
    tst r5, #7
    ldr r1, [sp], #4
    ldr r0, [sp], #4
    vpopeq {d8 - d15}
    ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from
    ; sp
    mov r5, #4
    add r0, r0, #8
    add r1, r1, #8
    mov r7, #16

core_loop_wd_4
    ; Narrow path: processes a 4-pixel-wide column, 4 output rows per
    ; inner-loop iteration, using 32-bit lane loads/stores.
    rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd
    rsb r8, r5, r2, lsl #2 ;r2->src_strd
    vmov.i8 d4, #0

outer_loop_wd_4
    subs r12, r5, #0
    ble end_inner_loop_wd_4 ;outer loop jump

inner_loop_wd_4
    add r3, r0, r2
    vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
    ; *)pu1_src_tmp, src_tmp1, 1);
    subs r12, r12, #4
    vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
    ; 1);
    vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
    ; *)pu1_src_tmp, src_tmp2, 1);
    vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t
    ; *)pu1_src_tmp, src_tmp1, 0);
    vdup.16 q0, r11
    vmlsl.u8 q0, d5, d23 ;mul_res1 =
    ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1);

    vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
    ; 1);
    add r0, r0, #4
    vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
    ; *)pu1_src_tmp, src_tmp3, 1);
    vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1,
    ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
    vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
    ; 1);
    vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
    ; *)pu1_src_tmp, src_tmp4, 1);
    vmlal.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1,
    ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
    vdup.16 q4, r11
    vmlsl.u8 q4, d7, d23
    vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4,
    ; 1);
    vmull.u8 q1, d7, d25 ;mul_res2 =
    ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
    vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
    ; *)pu1_src_tmp, src_tmp1, 1);
    vmlsl.u8 q4, d6, d22
    vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
    ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
    vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
    ; 1);
    vmlal.u8 q4, d4, d24
    vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
    ; *)pu1_src_tmp, src_tmp2, 1);
    vmlal.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2,
    ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
    vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
    ; 1);
    vmlal.u8 q4, d5, d25
    vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
    ; *)pu1_src_tmp, src_tmp3, 1);
    vmlsl.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
    ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
    vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
    ; 1);
    vmlal.u8 q4, d6, d26
    vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
    ; *)pu1_src_tmp, src_tmp4, 1);
    vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2,
    ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
    vdup.u32 d4, d7[1]
    vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1,
    ; mul_res2);
    vmlal.u8 q4, d7, d27
    vld1.u32 {d4[1]},[r3], r2
    vmlsl.u8 q4, d4, d28
    vdup.u32 d5, d4[1]
    vhadd.s16 q0, q0, q15
    vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
    vld1.u32 {d5[1]},[r3]
    add r3, r1, r6
    vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst,
    ; vreinterpret_u32_u8(sto_res), 0);
    vmlsl.u8 q4, d5, d29
    vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t
    ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
    vhadd.s16 q4, q4, q15
    vqrshrun.s16 d8, q4, #6
    vst1.32 {d8[0]},[r3], r6
    add r1, r1, #4
    vst1.32 {d8[1]},[r3]
    bgt inner_loop_wd_4

end_inner_loop_wd_4
    subs r7, r7, #4
    add r1, r1, r9
    add r0, r0, r8
    bgt outer_loop_wd_4

    vpop {d8 - d15}
    ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp

    ENDP
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm
new file mode 100644
index 0000000000..cb5d6d3fe5
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm
@@ -0,0 +1,455 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r6 => dst_stride
+; r12 => filter_y0
+; r5 => ht
+; r3 => wd
+
+ EXPORT |vpx_convolve8_vert_filter_type2_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_vert_filter_type2_neon| PROC
+
+    ; 8-tap vertical convolution. The taps are loaded as bytes, vabs'd to
+    ; magnitudes, and applied with the fixed sign pattern -,+,-,+,+,-,+,-
+    ; (the vmlsl/vmlal sequence below) that characterises "filter type 2".
+    ; NOTE(review): accumulators are seeded with 0xc000 (r11) and later
+    ; averaged with +0x4000 (q15) via vhadd before the >>6 narrowing --
+    ; presumably an offset trick keeping the u8 MAC sums in s16 range;
+    ; confirm against the matching filter_type1 implementation.
+    stmfd           sp!, {r4 - r12, r14} ;stack stores the values of
+                                    ; the arguments
+    vpush           {d8 - d15}           ; stack offset by 64
+    mov             r4, r1
+    mov             r1, r2
+    mov             r2, r4
+    vmov.i16        q15, #0x4000
+    mov             r11, #0xc000
+    ldr             r12, [sp, #104]      ;load filter
+    ldr             r6, [sp, #116]       ;load y0_q4
+    add             r12, r12, r6, lsl #4 ;r12 = filter[y0_q4]
+    mov             r6, r3
+    ldr             r5, [sp, #124]       ;load wd
+    vld2.8          {d0, d1}, [r12]      ;coeff = vld1_s8(pi1_coeff)
+    sub             r12, r2, r2, lsl #2  ;src_ctrd & pi1_coeff
+    vabs.s8         d0, d0               ;vabs_s8(coeff)
+    add             r0, r0, r12          ;r0->pu1_src r12->pi1_coeff
+    ldr             r3, [sp, #128]       ;load ht
+    subs            r7, r3, #0           ;r3->ht
+    vdup.u8         d22, d0[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                    ; 0);
+    cmp             r5, #8
+    vdup.u8         d23, d0[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                    ; 1);
+    vdup.u8         d24, d0[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                    ; 2);
+    vdup.u8         d25, d0[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                    ; 3);
+    vdup.u8         d26, d0[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                    ; 4);
+    vdup.u8         d27, d0[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                    ; 5);
+    vdup.u8         d28, d0[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                    ; 6);
+    vdup.u8         d29, d0[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                    ; 7);
+    blt             core_loop_wd_4       ;core loop wd 4 jump
+
+    str             r0, [sp, #-4]!
+    str             r1, [sp, #-4]!
+    bic             r4, r5, #7           ;r5 ->wd
+    rsb             r9, r4, r6, lsl #2   ;r6->dst_strd r5 ->wd
+    rsb             r8, r4, r2, lsl #2   ;r2->src_strd
+    mov             r3, r5, lsr #3       ;divide by 8
+    mul             r7, r3               ;multiply height by width
+    sub             r7, #4               ;subtract by one for epilog
+
+; Prime the 4-deep software pipeline: load the first source rows and start
+; the first four row accumulations (q4..q7) before entering main_loop_8.
+prolog
+    and             r10, r0, #31
+    add             r3, r0, r2           ;pu1_src_tmp += src_strd;
+    vdup.16         q4, r11
+    vld1.u8         {d1}, [r3], r2       ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vld1.u8         {d0}, [r0]!          ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    subs            r4, r4, #8
+    vld1.u8         {d2}, [r3], r2       ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d1, d23          ;mul_res1 = vmull_u8(src_tmp2,
+                                    ; coeffabs_1);
+    vld1.u8         {d3}, [r3], r2       ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4, d0, d22          ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_0);
+    vld1.u8         {d4}, [r3], r2       ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4, d2, d24          ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_2);
+    vld1.u8         {d5}, [r3], r2       ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d3, d25          ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_3);
+    vld1.u8         {d6}, [r3], r2       ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d4, d26          ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_4);
+    vld1.u8         {d7}, [r3], r2       ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4, d5, d27          ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp2, coeffabs_5);
+    vld1.u8         {d16}, [r3], r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d6, d28          ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_6);
+    vld1.u8         {d17}, [r3], r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4, d7, d29          ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_7);
+    vdup.16         q5, r11
+    vld1.u8         {d18}, [r3], r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5, d2, d23          ;mul_res2 = vmull_u8(src_tmp3,
+                                    ; coeffabs_1);
+    addle           r0, r0, r8
+    vmlsl.u8        q5, d1, d22          ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_0);
+    bicle           r4, r5, #7           ;r5 ->wd
+    vmlsl.u8        q5, d3, d24          ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_2);
+    pld             [r3]
+    vmlal.u8        q5, d4, d25          ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4, q4, q15
+    vdup.16         q6, r11
+    pld             [r3, r2]
+    vmlal.u8        q5, d5, d26          ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_4);
+    pld             [r3, r2, lsl #1]
+    vmlsl.u8        q5, d6, d27          ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp3, coeffabs_5);
+    add             r3, r3, r2
+    vmlal.u8        q5, d7, d28          ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_6);
+    pld             [r3, r2, lsl #1]
+    vmlsl.u8        q5, d16, d29         ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_7);
+    add             r3, r0, r2           ;pu1_src_tmp += src_strd;
+    vqrshrun.s16    d8, q4, #6           ;sto_res = vqmovun_s16(sto_res_tmp);
+
+    vld1.u8         {d1}, [r3], r2       ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6, d3, d23
+    vld1.u8         {d0}, [r0]!          ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6, d2, d22
+    vld1.u8         {d2}, [r3], r2       ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6, d4, d24
+    vhadd.s16       q5, q5, q15
+    vdup.16         q7, r11
+    vmlal.u8        q6, d5, d25
+    vmlal.u8        q6, d6, d26
+    vmlsl.u8        q6, d7, d27
+    vmlal.u8        q6, d16, d28
+    vmlsl.u8        q6, d17, d29
+    add             r14, r1, r6
+    vst1.8          {d8}, [r1]!          ;vst1_u8(pu1_dst,sto_res);
+    vqrshrun.s16    d10, q5, #6          ;sto_res = vqmovun_s16(sto_res_tmp);
+    addle           r1, r1, r9
+    vmlal.u8        q7, d4, d23
+    subs            r7, r7, #4
+    vmlsl.u8        q7, d3, d22
+    vmlsl.u8        q7, d5, d24
+    vmlal.u8        q7, d6, d25
+    vld1.u8         {d3}, [r3], r2       ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vhadd.s16       q6, q6, q15
+    vdup.16         q4, r11
+    vmlal.u8        q7, d7, d26
+    vld1.u8         {d4}, [r3], r2       ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7, d16, d27
+    vld1.u8         {d5}, [r3], r2       ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7, d17, d28
+    vld1.u8         {d6}, [r3], r2       ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7, d18, d29
+    vld1.u8         {d7}, [r3], r2       ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vst1.8          {d10}, [r14], r6     ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12, q6, #6
+    blt             epilog_end           ;jumps to epilog_end
+
+    beq             epilog               ;jumps to epilog
+
+; Steady state: four rows in flight per iteration; stores of the oldest
+; results are interleaved with loads/MACs of the newest rows.
+main_loop_8
+    subs            r4, r4, #8
+    vmlal.u8        q4, d1, d23          ;mul_res1 = vmull_u8(src_tmp2,
+                                    ; coeffabs_1);
+    addle           r0, r0, r8
+    vmlsl.u8        q4, d0, d22          ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_0);
+    bicle           r4, r5, #7           ;r5 ->wd
+    vmlsl.u8        q4, d2, d24          ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_2);
+    vld1.u8         {d16}, [r3], r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d3, d25          ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_3);
+    vhadd.s16       q7, q7, q15
+    vdup.16         q5, r11
+    vld1.u8         {d17}, [r3], r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4, d4, d26          ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_4);
+    vld1.u8         {d18}, [r3], r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4, d5, d27          ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp2, coeffabs_5);
+    vmlal.u8        q4, d6, d28          ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_6);
+    vmlsl.u8        q4, d7, d29          ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_7);
+    vst1.8          {d12}, [r14], r6
+    vqrshrun.s16    d14, q7, #6
+    add             r3, r0, r2           ;pu1_src_tmp += src_strd;
+    vmlal.u8        q5, d2, d23          ;mul_res2 = vmull_u8(src_tmp3,
+                                    ; coeffabs_1);
+    vld1.u8         {d0}, [r0]!          ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5, d1, d22          ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_0);
+    vmlsl.u8        q5, d3, d24          ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_2);
+    vld1.u8         {d1}, [r3], r2       ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5, d4, d25          ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4, q4, q15
+    vdup.16         q6, r11
+    vst1.8          {d14}, [r14], r6
+    vmlal.u8        q5, d5, d26          ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_4);
+    add             r14, r1, #0
+    vmlsl.u8        q5, d6, d27          ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp3, coeffabs_5);
+    add             r1, r1, #8
+    vmlal.u8        q5, d7, d28          ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_6);
+    vmlsl.u8        q5, d16, d29         ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_7);
+    addle           r1, r1, r9
+    vqrshrun.s16    d8, q4, #6           ;sto_res = vqmovun_s16(sto_res_tmp);
+    vmlal.u8        q6, d3, d23
+    add             r10, r3, r2, lsl #3  ; 10*strd - 8+2
+    vmlsl.u8        q6, d2, d22
+    add             r10, r10, r2         ; 11*strd
+    vmlsl.u8        q6, d4, d24
+    vld1.u8         {d2}, [r3], r2       ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6, d5, d25
+    vhadd.s16       q5, q5, q15
+    vdup.16         q7, r11
+    vmlal.u8        q6, d6, d26
+    vst1.8          {d8}, [r14], r6      ;vst1_u8(pu1_dst,sto_res);
+    pld             [r10]                ;11+ 0
+    vmlsl.u8        q6, d7, d27
+    pld             [r10, r2]            ;11+ 1*strd
+    vmlal.u8        q6, d16, d28
+    pld             [r10, r2, lsl #1]    ;11+ 2*strd
+    vmlsl.u8        q6, d17, d29
+    add             r10, r10, r2         ;12*strd
+    vqrshrun.s16    d10, q5, #6          ;sto_res = vqmovun_s16(sto_res_tmp);
+    pld             [r10, r2, lsl #1]    ;11+ 3*strd
+    vmlal.u8        q7, d4, d23
+    vmlsl.u8        q7, d3, d22
+    subs            r7, r7, #4
+    vmlsl.u8        q7, d5, d24
+    vmlal.u8        q7, d6, d25
+    vld1.u8         {d3}, [r3], r2       ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vhadd.s16       q6, q6, q15
+    vdup.16         q4, r11
+    vmlal.u8        q7, d7, d26
+    vld1.u8         {d4}, [r3], r2       ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7, d16, d27
+    vld1.u8         {d5}, [r3], r2       ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7, d17, d28
+    vld1.u8         {d6}, [r3], r2       ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7, d18, d29
+    vld1.u8         {d7}, [r3], r2       ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vqrshrun.s16    d12, q6, #6
+    vst1.8          {d10}, [r14], r6     ;vst1_u8(pu1_dst_tmp,sto_res);
+    bgt             main_loop_8          ;jumps to main_loop_8
+
+; Drain the pipeline: finish the last four in-flight rows without loading
+; a new block.
+epilog
+    vmlal.u8        q4, d1, d23          ;mul_res1 = vmull_u8(src_tmp2,
+    vmlsl.u8        q4, d0, d22          ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_0);
+    vmlsl.u8        q4, d2, d24          ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_2);
+    vmlal.u8        q4, d3, d25          ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_3);
+    vhadd.s16       q7, q7, q15
+    vdup.16         q5, r11
+    vmlal.u8        q4, d4, d26          ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp1, coeffabs_4);
+    vmlsl.u8        q4, d5, d27          ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp2, coeffabs_5);
+    vmlal.u8        q4, d6, d28          ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; src_tmp3, coeffabs_6);
+    vmlsl.u8        q4, d7, d29          ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; src_tmp4, coeffabs_7);
+    vst1.8          {d12}, [r14], r6
+    vqrshrun.s16    d14, q7, #6
+    vld1.u8         {d16}, [r3], r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5, d2, d23          ;mul_res2 = vmull_u8(src_tmp3,
+                                    ; coeffabs_1);
+    vmlsl.u8        q5, d1, d22          ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_0);
+    vmlsl.u8        q5, d3, d24          ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_2);
+    vmlal.u8        q5, d4, d25          ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4, q4, q15
+    vdup.16         q6, r11
+    vmlal.u8        q5, d5, d26          ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp2, coeffabs_4);
+    vmlsl.u8        q5, d6, d27          ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp3, coeffabs_5);
+    vmlal.u8        q5, d7, d28          ;mul_res2 = vmlal_u8(mul_res2,
+                                    ; src_tmp4, coeffabs_6);
+    vmlsl.u8        q5, d16, d29         ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; src_tmp1, coeffabs_7);
+    vst1.8          {d14}, [r14], r6
+    vqrshrun.s16    d8, q4, #6           ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d17}, [r3], r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6, d3, d23
+    vmlsl.u8        q6, d2, d22
+    vmlsl.u8        q6, d4, d24
+    vmlal.u8        q6, d5, d25
+    vhadd.s16       q5, q5, q15
+    vdup.16         q7, r11
+    vmlal.u8        q6, d6, d26
+    vmlsl.u8        q6, d7, d27
+    vmlal.u8        q6, d16, d28
+    vmlsl.u8        q6, d17, d29
+    add             r14, r1, r6
+    vst1.8          {d8}, [r1]!          ;vst1_u8(pu1_dst,sto_res);
+    vqrshrun.s16    d10, q5, #6          ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d18}, [r3], r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7, d4, d23
+    vmlsl.u8        q7, d3, d22
+    vmlsl.u8        q7, d5, d24
+    vmlal.u8        q7, d6, d25
+    vhadd.s16       q6, q6, q15
+    vmlal.u8        q7, d7, d26
+    vmlsl.u8        q7, d16, d27
+    vmlal.u8        q7, d17, d28
+    vmlsl.u8        q7, d18, d29
+    vst1.8          {d10}, [r14], r6     ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12, q6, #6
+
+epilog_end
+    vst1.8          {d12}, [r14], r6
+    vhadd.s16       q7, q7, q15
+    vqrshrun.s16    d14, q7, #6
+    vst1.8          {d14}, [r14], r6
+
+; If wd had no 4-wide remainder (tst r5, #7 sets Z), restore and return;
+; otherwise fall through to process the final 4-wide strip.
+; NOTE(review): the remainder path hard-codes r5=4 and r7=16 -- confirm
+; callers reach it only with a compatible height.
+end_loops
+    tst             r5, #7
+    ldr             r1, [sp], #4
+    ldr             r0, [sp], #4
+    vpopeq          {d8 - d15}
+    ldmfdeq         sp!, {r4 - r12, r15} ;reload the registers from sp
+    mov             r5, #4
+    add             r0, r0, #8
+    add             r1, r1, #8
+    mov             r7, #16
+
+; Width-4 path: rows are packed two-per-d-register via 32-bit lanes and
+; accumulated in q0/q1 (even output pair) and q4 (odd output pair).
+core_loop_wd_4
+    rsb             r9, r5, r6, lsl #2   ;r6->dst_strd r5 ->wd
+    rsb             r8, r5, r2, lsl #2   ;r2->src_strd
+    vmov.i8         d4, #0
+
+outer_loop_wd_4
+    subs            r12, r5, #0
+    ble             end_inner_loop_wd_4  ;outer loop jump
+
+inner_loop_wd_4
+    add             r3, r0, r2
+    vld1.u32        {d4[1]},[r3], r2     ;src_tmp1 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp1, 1);
+    subs            r12, r12, #4
+    vdup.u32        d5, d4[1]            ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                    ; 1);
+    vld1.u32        {d5[1]},[r3], r2     ;src_tmp2 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp2, 1);
+    vld1.u32        {d4[0]},[r0]         ;src_tmp1 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp1, 0);
+    vdup.16         q0, r11
+    vmlal.u8        q0, d5, d23          ;mul_res1 =
+                                    ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+    vdup.u32        d6, d5[1]            ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                    ; 1);
+    add             r0, r0, #4
+    vld1.u32        {d6[1]},[r3], r2     ;src_tmp3 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8        q0, d4, d22          ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+    vdup.u32        d7, d6[1]            ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                    ; 1);
+    vld1.u32        {d7[1]},[r3], r2     ;src_tmp4 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q0, d6, d24          ;mul_res1 = vmlsl_u8(mul_res1,
+                                    ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+    vdup.16         q4, r11
+    vmlal.u8        q4, d7, d23
+    vdup.u32        d4, d7[1]            ;src_tmp1 = vdup_lane_u32(src_tmp4,
+                                    ; 1);
+    vmull.u8        q1, d7, d25          ;mul_res2 =
+                                    ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+    vld1.u32        {d4[1]},[r3], r2     ;src_tmp1 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp1, 1);
+    vmlsl.u8        q4, d6, d22
+    vmlal.u8        q0, d4, d26          ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+    vdup.u32        d5, d4[1]            ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                    ; 1);
+    vmlsl.u8        q4, d4, d24
+    vld1.u32        {d5[1]},[r3], r2     ;src_tmp2 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp2, 1);
+    vmlsl.u8        q1, d5, d27          ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+    vdup.u32        d6, d5[1]            ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                    ; 1);
+    vmlal.u8        q4, d5, d25
+    vld1.u32        {d6[1]},[r3], r2     ;src_tmp3 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlal.u8        q0, d6, d28          ;mul_res1 = vmlal_u8(mul_res1,
+                                    ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+    vdup.u32        d7, d6[1]            ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                    ; 1);
+    vmlal.u8        q4, d6, d26
+    vld1.u32        {d7[1]},[r3], r2     ;src_tmp4 = vld1_lane_u32((uint32_t
+                                    ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q1, d7, d29          ;mul_res2 = vmlsl_u8(mul_res2,
+                                    ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+    vdup.u32        d4, d7[1]
+    vadd.i16        q0, q0, q1           ;mul_res1 = vaddq_u16(mul_res1,
+                                    ; mul_res2);
+    vmlsl.u8        q4, d7, d27
+    vld1.u32        {d4[1]},[r3], r2
+    vmlal.u8        q4, d4, d28
+    vdup.u32        d5, d4[1]
+    vhadd.s16       q0, q0, q15
+    vqrshrun.s16    d0, q0, #6           ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u32        {d5[1]},[r3]
+    add             r3, r1, r6
+    vst1.32         {d0[0]},[r1]         ;vst1_lane_u32((uint32_t *)pu1_dst,
+                                    ; vreinterpret_u32_u8(sto_res), 0);
+    vmlsl.u8        q4, d5, d29
+    vst1.32         {d0[1]},[r3], r6     ;vst1_lane_u32((uint32_t
+                                    ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+    vhadd.s16       q4, q4, q15
+    vqrshrun.s16    d8, q4, #6
+    vst1.32         {d8[0]},[r3], r6
+    add             r1, r1, #4
+    vst1.32         {d8[1]},[r3]
+    bgt             inner_loop_wd_4
+
+end_inner_loop_wd_4
+    subs            r7, r7, #4
+    add             r1, r1, r9
+    add             r0, r0, r8
+    bgt             outer_loop_wd_4
+
+    vpop            {d8 - d15}
+    ldmfd           sp!, {r4 - r12, r15} ;reload the registers from sp
+
+    ENDP
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
new file mode 100644
index 0000000000..8e3ee599f4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *filter, int x0_q4, int x_step_q4,
+                           int y0_q4, int y_step_q4, int w, int h) {
+  // No filtering is performed here: the source block is rounding-averaged
+  // into the destination block (dst = vrhadd(src, dst)). The filter/phase
+  // arguments exist only so the signature matches the generic convolve
+  // prototype.
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (w < 8) {  // 4 wide: pack two rows into one 8-byte vector.
+    uint8x8_t dst_pair = vdup_n_u8(0);
+    do {
+      const uint8x8_t row0 = vld1_u8(src);
+      const uint8x8_t row1 = vld1_u8(src + src_stride);
+      const uint32x2x2_t zipped =
+          vzip_u32(vreinterpret_u32_u8(row0), vreinterpret_u32_u8(row1));
+      src += 2 * src_stride;
+      // Gather the two 4-byte dst rows into the two 32-bit lanes.
+      dst_pair = vreinterpret_u8_u32(vld1_lane_u32(
+          (const uint32_t *)dst, vreinterpret_u32_u8(dst_pair), 0));
+      dst_pair = vreinterpret_u8_u32(
+          vld1_lane_u32((const uint32_t *)(dst + dst_stride),
+                        vreinterpret_u32_u8(dst_pair), 1));
+      dst_pair = vrhadd_u8(vreinterpret_u8_u32(zipped.val[0]), dst_pair);
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dst_pair), 0);
+      vst1_lane_u32((uint32_t *)(dst + dst_stride),
+                    vreinterpret_u32_u8(dst_pair), 1);
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 8) {  // Two 8-byte rows per iteration as one q-register.
+    do {
+      const uint8x8_t row0 = vld1_u8(src);
+      const uint8x8_t row1 = vld1_u8(src + src_stride);
+      const uint8x16_t s_pair = vcombine_u8(row0, row1);
+      const uint8x16_t d_pair =
+          vcombine_u8(vld1_u8(dst), vld1_u8(dst + dst_stride));
+      const uint8x16_t avg = vrhaddq_u8(s_pair, d_pair);
+      src += 2 * src_stride;
+      vst1_u8(dst, vget_low_u8(avg));
+      vst1_u8(dst + dst_stride, vget_high_u8(avg));
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w < 32) {  // 16 wide: two rows per iteration.
+    do {
+      const uint8x16_t s0 = vld1q_u8(src);
+      const uint8x16_t s1 = vld1q_u8(src + src_stride);
+      const uint8x16_t a0 = vrhaddq_u8(s0, vld1q_u8(dst));
+      const uint8x16_t a1 = vrhaddq_u8(s1, vld1q_u8(dst + dst_stride));
+      src += 2 * src_stride;
+      vst1q_u8(dst, a0);
+      vst1q_u8(dst + dst_stride, a1);
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 32) {  // 32 wide: two 16-byte halves, two rows.
+    do {
+      const uint8x16_t s0 = vld1q_u8(src);
+      const uint8x16_t s1 = vld1q_u8(src + 16);
+      const uint8x16_t s2 = vld1q_u8(src + src_stride);
+      const uint8x16_t s3 = vld1q_u8(src + src_stride + 16);
+      const uint8x16_t a0 = vrhaddq_u8(s0, vld1q_u8(dst));
+      const uint8x16_t a1 = vrhaddq_u8(s1, vld1q_u8(dst + 16));
+      const uint8x16_t a2 = vrhaddq_u8(s2, vld1q_u8(dst + dst_stride));
+      const uint8x16_t a3 = vrhaddq_u8(s3, vld1q_u8(dst + dst_stride + 16));
+      src += 2 * src_stride;
+      vst1q_u8(dst, a0);
+      vst1q_u8(dst + 16, a1);
+      vst1q_u8(dst + dst_stride, a2);
+      vst1q_u8(dst + dst_stride + 16, a3);
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else {  // 64 wide: one row per iteration, four 16-byte chunks.
+    do {
+      int i;
+      for (i = 0; i < 64; i += 16) {
+        const uint8x16_t s = vld1q_u8(src + i);
+        const uint8x16_t d = vld1q_u8(dst + i);
+        vst1q_u8(dst + i, vrhaddq_u8(s, d));
+      }
+      src += src_stride;
+      dst += dst_stride;
+    } while (--h);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
new file mode 100644
index 0000000000..efd6574f1f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
@@ -0,0 +1,116 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_convolve_avg_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve_avg_neon| PROC
+    ; Rounding average of a src block into a dst block (no filtering):
+    ; dst = vrhadd(src, dst). AAPCS register map: r0=src, r1=src_stride,
+    ; r2=dst, r3=dst_stride; w/h are loaded from the stack below and the
+    ; branch ladder dispatches on width. r6 is a second dst pointer used
+    ; for the read side so the write pointer (r2) can advance separately.
+    ; NOTE(review): the @128/@64/@32 qualifiers assert dst-row alignment;
+    ; confirm callers guarantee it.
+    push            {r4-r6, lr}
+    ldrd            r4, r5, [sp, #36]    ; r4 = w, r5 = h (stack args)
+    mov             r6, r2
+
+    cmp             r4, #32
+    bgt             avg64
+    beq             avg32
+    cmp             r4, #8
+    bgt             avg16
+    beq             avg8
+    b               avg4
+
+avg64
+    ; One row per iteration: two 32-byte halves, post-indexed back to the
+    ; start of the next row (lr/r4 hold stride minus the first half).
+    sub             lr, r1, #32
+    sub             r4, r3, #32
+avg64_h
+    pld             [r0, r1, lsl #1]
+    vld1.8          {q0-q1}, [r0]!
+    vld1.8          {q2-q3}, [r0], lr
+    pld             [r2, r3]
+    vld1.8          {q8-q9}, [r6@128]!
+    vld1.8          {q10-q11}, [r6@128], r4
+    vrhadd.u8       q0, q0, q8
+    vrhadd.u8       q1, q1, q9
+    vrhadd.u8       q2, q2, q10
+    vrhadd.u8       q3, q3, q11
+    vst1.8          {q0-q1}, [r2@128]!
+    vst1.8          {q2-q3}, [r2@128], r4
+    subs            r5, r5, #1
+    bgt             avg64_h
+    pop             {r4-r6, pc}
+
+avg32
+    ; Two rows per iteration, 32 bytes each.
+    vld1.8          {q0-q1}, [r0], r1
+    vld1.8          {q2-q3}, [r0], r1
+    vld1.8          {q8-q9}, [r6@128], r3
+    vld1.8          {q10-q11}, [r6@128], r3
+    pld             [r0]
+    vrhadd.u8       q0, q0, q8
+    pld             [r0, r1]
+    vrhadd.u8       q1, q1, q9
+    pld             [r6]
+    vrhadd.u8       q2, q2, q10
+    pld             [r6, r3]
+    vrhadd.u8       q3, q3, q11
+    vst1.8          {q0-q1}, [r2@128], r3
+    vst1.8          {q2-q3}, [r2@128], r3
+    subs            r5, r5, #2
+    bgt             avg32
+    pop             {r4-r6, pc}
+
+avg16
+    ; Two 16-byte rows per iteration.
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q2}, [r6@128], r3
+    vld1.8          {q3}, [r6@128], r3
+    pld             [r0]
+    pld             [r0, r1]
+    vrhadd.u8       q0, q0, q2
+    pld             [r6]
+    pld             [r6, r3]
+    vrhadd.u8       q1, q1, q3
+    vst1.8          {q0}, [r2@128], r3
+    vst1.8          {q1}, [r2@128], r3
+    subs            r5, r5, #2
+    bgt             avg16
+    pop             {r4-r6, pc}
+
+avg8
+    ; Two 8-byte rows per iteration; q0 = src pair, q1 = dst pair.
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d1}, [r0], r1
+    vld1.8          {d2}, [r6@64], r3
+    vld1.8          {d3}, [r6@64], r3
+    pld             [r0]
+    pld             [r0, r1]
+    vrhadd.u8       q0, q0, q1
+    pld             [r6]
+    pld             [r6, r3]
+    vst1.8          {d0}, [r2@64], r3
+    vst1.8          {d1}, [r2@64], r3
+    subs            r5, r5, #2
+    bgt             avg8
+    pop             {r4-r6, pc}
+
+avg4
+    ; Two 4-byte rows packed into one d-register via 32-bit lanes.
+    vld1.32         {d0[0]}, [r0], r1
+    vld1.32         {d0[1]}, [r0], r1
+    vld1.32         {d2[0]}, [r6@32], r3
+    vld1.32         {d2[1]}, [r6@32], r3
+    vrhadd.u8       d0, d0, d2
+    vst1.32         {d0[0]}, [r2@32], r3
+    vst1.32         {d0[1]}, [r2@32], r3
+    subs            r5, r5, #2
+    bgt             avg4
+    pop             {r4-r6, pc}
+    ENDP
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
new file mode 100644
index 0000000000..bea7c98437
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <string.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h) {
+  // Pure block copy; the filter/phase arguments are part of the generic
+  // convolve prototype and are ignored.
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (w < 8) {  // 4 wide: memcpy two rows per iteration.
+    do {
+      memcpy(dst, src, 4);
+      memcpy(dst + dst_stride, src + src_stride, 4);
+      src += 2 * src_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 8) {  // Two 8-byte rows per iteration.
+    do {
+      const uint8x8_t row0 = vld1_u8(src);
+      const uint8x8_t row1 = vld1_u8(src + src_stride);
+      src += 2 * src_stride;
+      vst1_u8(dst, row0);
+      vst1_u8(dst + dst_stride, row1);
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w < 32) {  // 16 wide: two rows per iteration.
+    do {
+      const uint8x16_t row0 = vld1q_u8(src);
+      const uint8x16_t row1 = vld1q_u8(src + src_stride);
+      src += 2 * src_stride;
+      vst1q_u8(dst, row0);
+      vst1q_u8(dst + dst_stride, row1);
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 32) {  // 32 wide: two 16-byte halves, two rows.
+    do {
+      const uint8x16_t r0a = vld1q_u8(src);
+      const uint8x16_t r0b = vld1q_u8(src + 16);
+      const uint8x16_t r1a = vld1q_u8(src + src_stride);
+      const uint8x16_t r1b = vld1q_u8(src + src_stride + 16);
+      src += 2 * src_stride;
+      vst1q_u8(dst, r0a);
+      vst1q_u8(dst + 16, r0b);
+      vst1q_u8(dst + dst_stride, r1a);
+      vst1q_u8(dst + dst_stride + 16, r1b);
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else {  // 64 wide: one row per iteration, four 16-byte chunks.
+    do {
+      int i;
+      for (i = 0; i < 64; i += 16) {
+        vst1q_u8(dst + i, vld1q_u8(src + i));
+      }
+      src += src_stride;
+      dst += dst_stride;
+    } while (--h);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
new file mode 100644
index 0000000000..7a66e3ce2f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
@@ -0,0 +1,84 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vpx_convolve_copy_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve_copy_neon| PROC
+    ; Straight block copy (no filtering). AAPCS register map: r0=src,
+    ; r1=src_stride, r2=dst, r3=dst_stride; w/h come off the stack below
+    ; and the branch ladder dispatches on width.
+    ; NOTE(review): the @128/@64 qualifiers assert dst-row alignment;
+    ; confirm callers guarantee it.
+    push            {r4-r5, lr}
+    ldrd            r4, r5, [sp, #32]    ; r4 = w, r5 = h (stack args)
+
+    cmp             r4, #32
+    bgt             copy64
+    beq             copy32
+    cmp             r4, #8
+    bgt             copy16
+    beq             copy8
+    b               copy4
+
+copy64
+    ; One 64-byte row per iteration, split into two 32-byte halves;
+    ; lr/r3 are the strides minus the first half's advance.
+    sub             lr, r1, #32
+    sub             r3, r3, #32
+copy64_h
+    pld             [r0, r1, lsl #1]
+    vld1.8          {q0-q1}, [r0]!
+    vld1.8          {q2-q3}, [r0], lr
+    vst1.8          {q0-q1}, [r2@128]!
+    vst1.8          {q2-q3}, [r2@128], r3
+    subs            r5, r5, #1
+    bgt             copy64_h
+    pop             {r4-r5, pc}
+
+copy32
+    ; Two 32-byte rows per iteration.
+    pld             [r0, r1, lsl #1]
+    vld1.8          {q0-q1}, [r0], r1
+    pld             [r0, r1, lsl #1]
+    vld1.8          {q2-q3}, [r0], r1
+    vst1.8          {q0-q1}, [r2@128], r3
+    vst1.8          {q2-q3}, [r2@128], r3
+    subs            r5, r5, #2
+    bgt             copy32
+    pop             {r4-r5, pc}
+
+copy16
+    ; Two 16-byte rows per iteration.
+    pld             [r0, r1, lsl #1]
+    vld1.8          {q0}, [r0], r1
+    pld             [r0, r1, lsl #1]
+    vld1.8          {q1}, [r0], r1
+    vst1.8          {q0}, [r2@128], r3
+    vst1.8          {q1}, [r2@128], r3
+    subs            r5, r5, #2
+    bgt             copy16
+    pop             {r4-r5, pc}
+
+copy8
+    ; Two 8-byte rows per iteration.
+    pld             [r0, r1, lsl #1]
+    vld1.8          {d0}, [r0], r1
+    pld             [r0, r1, lsl #1]
+    vld1.8          {d2}, [r0], r1
+    vst1.8          {d0}, [r2@64], r3
+    vst1.8          {d2}, [r2@64], r3
+    subs            r5, r5, #2
+    bgt             copy8
+    pop             {r4-r5, pc}
+
+copy4
+    ; One 4-byte row per iteration via plain word load/store (ARMv7
+    ; permits unaligned ldr/str here).
+    ldr             r12, [r0], r1
+    str             r12, [r2], r3
+    subs            r5, r5, #1
+    bgt             copy4
+    pop             {r4-r5, pc}
+    ENDP
+
+ END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
new file mode 100644
index 0000000000..830f3176d7
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                        ptrdiff_t dst_stride, const InterpKernel *filter,
+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                        int w, int h) {
+  // Scratch buffer for the horizontal pass. With w, h <= 64 and an 8-tap
+  // vertical filter, 64 * 72 rows of intermediate data are enough (the
+  // vertical pass needs extra rows above and below each output row, and the
+  // NEON kernels may overshoot the requested height by up to a multiple of
+  // 4 rows - the slack here absorbs that).
+  uint8_t temp[64 * 72];
+
+  // 7 extra taps' worth of rows, rounded up so the count is divisible by 4.
+  const int intermediate_height = h + 8;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  // Horizontal pass starts 3 rows above the block so the vertical pass has
+  // its leading context. The NEON kernel rounds the height it filters up to
+  // a multiple of 4; the oversized temp buffer makes that harmless.
+  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
+                           x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+                           intermediate_height);
+
+  // Vertical pass: skip the 3 context rows to land on the real frame data.
+  vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
+                          x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h) {
+  // Same two-pass scheme as vpx_convolve8_neon, but the final vertical pass
+  // rounding-averages into dst. Averaging must happen only after both
+  // passes, so the horizontal pass writes to scratch unaveraged.
+  uint8_t temp[64 * 72];
+  const int intermediate_height = h + 8;  // h + 7 taps, rounded up to x4.
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  // Horizontal pass with 3 rows of leading context (see vpx_convolve8_neon).
+  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
+                           x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+                           intermediate_height);
+  // Averaging vertical pass over the real frame rows.
+  vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
new file mode 100644
index 0000000000..b8e3c5e540
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void scaledconvolve_horiz_w4(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+    const int x0_q4, const int x_step_q4, const int w, const int h) {
+  /* Horizontal scaled convolution for narrow blocks (w a multiple of 4).
+   * x_q4 walks the source in q4 fixed point; each output column uses the
+   * 8-tap kernel selected by its sub-pixel phase (x_q4 & SUBPEL_MASK).
+   * Works on 4x4 output tiles via a transpose scratch buffer. */
+  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+  int x, y, z;
+
+  /* Step back to the first filter tap (SUBPEL_TAPS / 2 - 1 = 3 columns). */
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  y = h;
+  do {
+    int x_q4 = x0_q4;
+    x = 0;
+    do {
+      // process 4 src_x steps
+      for (z = 0; z < 4; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+        if (x_q4 & SUBPEL_MASK) {
+          /* Sub-pixel phase: apply the phase's 8-tap kernel. */
+          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
+          uint8x8_t s[8], d;
+          int16x8_t ss[4];
+          int16x4_t t[8], tt;
+
+          /* Load 4 rows of 8 pixels and transpose so each t[i] holds tap i
+           * for all 4 rows, letting convolve8_4 produce 4 outputs at once. */
+          load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
+          transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
+
+          /* Widen to s16 for the multiply-accumulate in convolve8_4. */
+          ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
+          ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
+          ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
+          ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
+          t[0] = vget_low_s16(ss[0]);
+          t[1] = vget_low_s16(ss[1]);
+          t[2] = vget_low_s16(ss[2]);
+          t[3] = vget_low_s16(ss[3]);
+          t[4] = vget_high_s16(ss[0]);
+          t[5] = vget_high_s16(ss[1]);
+          t[6] = vget_high_s16(ss[2]);
+          t[7] = vget_high_s16(ss[3]);
+
+          tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
+                           filters);
+          /* Round/saturate back to 8 bits (filters carry 7 fractional bits)
+           * and store the 4 results for this step into the scratch tile. */
+          d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
+          vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
+        } else {
+          /* Full-pel position: no filtering, copy the center-tap pixels. */
+          int i;
+          for (i = 0; i < 4; ++i) {
+            temp[z * 4 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // transpose the 4x4 filters values back to dst
+      {
+        /* NOTE(review): vld4_u8 de-interleaves 32 bytes but temp holds only
+         * 16; only lane 0 of each resulting vector is stored below, so the
+         * extra bytes are unused — the over-read looks worth confirming. */
+        const uint8x8x4_t d4 = vld4_u8(temp);
+        vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
+                      vreinterpret_u32_u8(d4.val[0]), 0);
+        vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
+                      vreinterpret_u32_u8(d4.val[1]), 0);
+        vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
+                      vreinterpret_u32_u8(d4.val[2]), 0);
+        vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
+                      vreinterpret_u32_u8(d4.val[3]), 0);
+      }
+      x += 4;
+    } while (x < w);
+
+    src += src_stride * 4;
+    dst += dst_stride * 4;
+    y -= 4;
+  } while (y > 0);
+}
+
+static INLINE void scaledconvolve_horiz_w8(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+    const int x0_q4, const int x_step_q4, const int w, const int h) {
+  /* Horizontal scaled convolution for w >= 8 (w a multiple of 8).
+   * x_q4 walks the source in q4 fixed point; each output column uses the
+   * 8-tap kernel selected by its sub-pixel phase. Works on 8x8 output tiles
+   * through a transpose scratch buffer. */
+  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+  int x, y, z;
+  /* Step back to the first filter tap (SUBPEL_TAPS / 2 - 1 = 3 columns). */
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  // This function processes 8x8 areas. The intermediate height is not always
+  // a multiple of 8, so force it to be a multiple of 8 here.
+  y = (h + 7) & ~7;
+
+  do {
+    int x_q4 = x0_q4;
+    x = 0;
+    do {
+      uint8x8_t d[8];
+      // process 8 src_x steps
+      for (z = 0; z < 8; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+
+        if (x_q4 & SUBPEL_MASK) {
+          /* Sub-pixel phase: transpose 8 rows so scale_filter_8 can apply
+           * the 8-tap kernel to 8 rows' pixels at this column in parallel. */
+          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
+          uint8x8_t s[8];
+          load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
+                      &s[5], &s[6], &s[7]);
+          transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                          &s[7]);
+          d[0] = scale_filter_8(s, filters);
+          vst1_u8(&temp[8 * z], d[0]);
+        } else {
+          /* Full-pel position: no filtering, copy the center-tap pixels. */
+          int i;
+          for (i = 0; i < 8; ++i) {
+            temp[z * 8 + i] = src_x[i * src_stride + 3];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // transpose the 8x8 filters values back to dst
+      load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
+                  &d[7]);
+      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+      vst1_u8(&dst[x + 0 * dst_stride], d[0]);
+      vst1_u8(&dst[x + 1 * dst_stride], d[1]);
+      vst1_u8(&dst[x + 2 * dst_stride], d[2]);
+      vst1_u8(&dst[x + 3 * dst_stride], d[3]);
+      vst1_u8(&dst[x + 4 * dst_stride], d[4]);
+      vst1_u8(&dst[x + 5 * dst_stride], d[5]);
+      vst1_u8(&dst[x + 6 * dst_stride], d[6]);
+      vst1_u8(&dst[x + 7 * dst_stride], d[7]);
+      x += 8;
+    } while (x < w);
+
+    src += src_stride * 8;
+    dst += dst_stride * 8;
+  } while (y -= 8);
+}
+
+static INLINE void scaledconvolve_vert_w4(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  /* Vertical scaled convolution for narrow blocks (w <= 4).
+   * y_q4 walks the source rows in q4 fixed point; each output row uses the
+   * 8-tap kernel selected by its sub-pixel phase (y_q4 & SUBPEL_MASK). */
+  int y;
+  int y_q4 = y0_q4;
+
+  /* Step back to the first filter tap (3 rows above the output row). */
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  y = h;
+  do {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+
+    if (y_q4 & SUBPEL_MASK) {
+      /* Sub-pixel phase: gather the 8 tap rows and filter 4 pixels. */
+      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+      uint8x8_t s[8], d;
+      int16x4_t t[8], tt;
+
+      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                  &s[6], &s[7]);
+      /* Widen the low 4 pixels of each tap row to s16 for convolve8_4. */
+      t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
+      t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
+      t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
+      t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
+      t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
+      t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
+      t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
+      t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
+
+      tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
+      /* Round/saturate to 8 bits and store 4 pixels. */
+      d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
+    } else {
+      /* Full-pel position: copy the center-tap row unfiltered. */
+      memcpy(dst, &src_y[3 * src_stride], w);
+    }
+
+    dst += dst_stride;
+    y_q4 += y_step_q4;
+  } while (--y);
+}
+
+static INLINE void scaledconvolve_vert_w8(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  /* Vertical scaled convolution for w == 8. Same structure as the w4
+   * variant, but scale_filter_8 produces a full 8-pixel output row. */
+  int y;
+  int y_q4 = y0_q4;
+
+  /* Step back to the first filter tap (3 rows above the output row). */
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  y = h;
+  do {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    if (y_q4 & SUBPEL_MASK) {
+      /* Sub-pixel phase: filter the 8 tap rows with the phase's kernel. */
+      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+      uint8x8_t s[8], d;
+      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                  &s[6], &s[7]);
+      d = scale_filter_8(s, filters);
+      vst1_u8(dst, d);
+    } else {
+      /* Full-pel position: copy the center-tap row unfiltered. */
+      memcpy(dst, &src_y[3 * src_stride], w);
+    }
+    dst += dst_stride;
+    y_q4 += y_step_q4;
+  } while (--y);
+}
+
+static INLINE void scaledconvolve_vert_w16(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
+  /* Vertical scaled convolution for w >= 16 (w a multiple of 16).
+   * Each output row is produced 16 pixels at a time by running
+   * scale_filter_8 on the low and high halves of 16-byte loads. */
+  int x, y;
+  int y_q4 = y0_q4;
+
+  /* Step back to the first filter tap (3 rows above the output row). */
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  y = h;
+  do {
+    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    if (y_q4 & SUBPEL_MASK) {
+      x = 0;
+      do {
+        const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
+        uint8x16_t ss[8];
+        uint8x8_t s[8], d[2];
+        load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
+                     &ss[5], &ss[6], &ss[7]);
+        /* Filter the low 8 pixels of each tap row... */
+        s[0] = vget_low_u8(ss[0]);
+        s[1] = vget_low_u8(ss[1]);
+        s[2] = vget_low_u8(ss[2]);
+        s[3] = vget_low_u8(ss[3]);
+        s[4] = vget_low_u8(ss[4]);
+        s[5] = vget_low_u8(ss[5]);
+        s[6] = vget_low_u8(ss[6]);
+        s[7] = vget_low_u8(ss[7]);
+        d[0] = scale_filter_8(s, filters);
+
+        /* ...then the high 8 pixels, and store all 16 at once. */
+        s[0] = vget_high_u8(ss[0]);
+        s[1] = vget_high_u8(ss[1]);
+        s[2] = vget_high_u8(ss[2]);
+        s[3] = vget_high_u8(ss[3]);
+        s[4] = vget_high_u8(ss[4]);
+        s[5] = vget_high_u8(ss[5]);
+        s[6] = vget_high_u8(ss[6]);
+        s[7] = vget_high_u8(ss[7]);
+        d[1] = scale_filter_8(s, filters);
+        vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
+        src_y += 16;
+        x += 16;
+      } while (x < w);
+    } else {
+      /* Full-pel position: copy the center-tap row unfiltered. */
+      memcpy(dst, &src_y[3 * src_stride], w);
+    }
+    dst += dst_stride;
+    y_q4 += y_step_q4;
+  } while (--y);
+}
+
+void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                        ptrdiff_t dst_stride, const InterpKernel *filter,
+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                        int w, int h) {
+  /* Scaled 2-D 8-tap convolution entry point: filters horizontally into a
+   * fixed 64-column scratch buffer, then vertically into dst, dispatching
+   * on w to the width-specialized helpers above. */
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  // When calling in frame scaling function, the smallest scaling factor is x1/4
+  // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+  // big enough.
+  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+  /* Last source row touched by the vertical pass, plus the 8 filter taps. */
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
+
+  /* Horizontal pass: start 3 rows/columns early to cover the filter tails;
+   * temp uses a fixed stride of 64. */
+  if (w >= 8) {
+    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+                            intermediate_height);
+  } else {
+    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+                            intermediate_height);
+  }
+
+  /* Vertical pass: skip the 3 leading context rows written into temp. */
+  if (w >= 16) {
+    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                            dst_stride, filter, y0_q4, y_step_q4, w, h);
+  } else if (w == 8) {
+    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                           dst_stride, filter, y0_q4, y_step_q4, w, h);
+  } else {
+    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                           dst_stride, filter, y0_q4, y_step_q4, w, h);
+  }
+}