author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
commit     26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree       f435a8308119effd964b339f76abb83a57c29483  /third_party/aom/aom_dsp/arm
parent     Initial commit. (diff)
Adding upstream version 124.0.1. (upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/aom/aom_dsp/arm')
-rw-r--r--  third_party/aom/aom_dsp/arm/aom_convolve8_neon.c | 349
-rw-r--r--  third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c | 460
-rw-r--r--  third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c | 408
-rw-r--r--  third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c | 154
-rw-r--r--  third_party/aom/aom_dsp/arm/avg_neon.c | 309
-rw-r--r--  third_party/aom/aom_dsp/arm/avg_pred_neon.c | 221
-rw-r--r--  third_party/aom/aom_dsp/arm/avg_sve.c | 62
-rw-r--r--  third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c | 492
-rw-r--r--  third_party/aom/aom_dsp/arm/blend_neon.h | 125
-rw-r--r--  third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c | 124
-rw-r--r--  third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c | 106
-rw-r--r--  third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h | 65
-rw-r--r--  third_party/aom/aom_dsp/arm/dot_sve.h | 42
-rw-r--r--  third_party/aom/aom_dsp/arm/fwd_txfm_neon.c | 304
-rw-r--r--  third_party/aom/aom_dsp/arm/hadamard_neon.c | 325
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_avg_neon.c | 125
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c | 190
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c | 97
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c | 473
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c | 105
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_convolve8_neon.c | 363
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c | 213
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_intrapred_neon.c | 2730
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c | 1265
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_masked_sad_neon.c | 354
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c | 211
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_obmc_variance_neon.c | 369
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_quantize_neon.c | 431
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_sad_neon.c | 509
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c | 617
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_sse_neon.c | 284
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_sse_sve.c | 215
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_subpel_variance_neon.c | 1497
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_variance_neon.c | 502
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_variance_neon_dotprod.c | 92
-rw-r--r--  third_party/aom/aom_dsp/arm/highbd_variance_sve.c | 430
-rw-r--r--  third_party/aom/aom_dsp/arm/intrapred_neon.c | 3110
-rw-r--r--  third_party/aom/aom_dsp/arm/loopfilter_neon.c | 1045
-rw-r--r--  third_party/aom/aom_dsp/arm/masked_sad4d_neon.c | 562
-rw-r--r--  third_party/aom/aom_dsp/arm/masked_sad_neon.c | 244
-rw-r--r--  third_party/aom/aom_dsp/arm/mem_neon.h | 1253
-rw-r--r--  third_party/aom/aom_dsp/arm/obmc_sad_neon.c | 250
-rw-r--r--  third_party/aom/aom_dsp/arm/obmc_variance_neon.c | 290
-rw-r--r--  third_party/aom/aom_dsp/arm/reinterpret_neon.h | 33
-rw-r--r--  third_party/aom/aom_dsp/arm/sad_neon.c | 873
-rw-r--r--  third_party/aom/aom_dsp/arm/sad_neon_dotprod.c | 530
-rw-r--r--  third_party/aom/aom_dsp/arm/sadxd_neon.c | 514
-rw-r--r--  third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c | 289
-rw-r--r--  third_party/aom/aom_dsp/arm/sse_neon.c | 210
-rw-r--r--  third_party/aom/aom_dsp/arm/sse_neon_dotprod.c | 223
-rw-r--r--  third_party/aom/aom_dsp/arm/subpel_variance_neon.c | 1103
-rw-r--r--  third_party/aom/aom_dsp/arm/subtract_neon.c | 166
-rw-r--r--  third_party/aom/aom_dsp/arm/sum_neon.h | 311
-rw-r--r--  third_party/aom/aom_dsp/arm/sum_squares_neon.c | 574
-rw-r--r--  third_party/aom/aom_dsp/arm/sum_squares_neon_dotprod.c | 154
-rw-r--r--  third_party/aom/aom_dsp/arm/sum_squares_sve.c | 402
-rw-r--r--  third_party/aom/aom_dsp/arm/transpose_neon.h | 1263
-rw-r--r--  third_party/aom/aom_dsp/arm/variance_neon.c | 470
-rw-r--r--  third_party/aom/aom_dsp/arm/variance_neon_dotprod.c | 314
59 files changed, 28801 insertions, 0 deletions
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c
new file mode 100644
index 0000000000..7441108b01
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t filter) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
+ int16x4_t sum;
+
+ sum = vmul_lane_s16(s0, filter_lo, 0);
+ sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmla_lane_s16(sum, s7, filter_hi, 3);
+ sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3));
+ sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0));
+ return sum;
+}
+
+static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t filter) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
+ int16x8_t sum;
+
+ sum = vmulq_lane_s16(s0, filter_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3));
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const int16x8_t filter = vld1q_s16(filter_x);
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)x_step_q4;
+ (void)filter_y;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1);
+
+ if (h == 4) {
+ uint8x8_t t0, t1, t2, t3, d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
+ s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+ src += 7;
+
+ do {
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ transpose_elems_inplace_u8_4x4(&d01, &d23);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01);
+ store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4;
+ dst += 4;
+ w -= 4;
+ } while (w != 0);
+ } else {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, d0, d1, d2, d3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+ if (w == 4) {
+ do {
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
+ transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2,
+ &t3);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+
+ transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, 4 * dst_stride, d0);
+ store_u8x4_strided_x2(dst + 1 * dst_stride, 4 * dst_stride, d1);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, 4 * dst_stride, d2);
+ store_u8x4_strided_x2(dst + 3 * dst_stride, 4 * dst_stride, d3);
+
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+ } else {
+ uint8x8_t d4, d5, d6, d7;
+ int16x8_t s11, s12, s13, s14;
+ int width;
+ const uint8_t *s;
+ uint8_t *d;
+
+ do {
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ width = w;
+ s = src + 7;
+ d = dst;
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
+ d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
+ d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
+ d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
+
+ transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6,
+ &d7);
+
+ store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+ }
+ }
+}
+
+void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const int16x8_t filter = vld1q_s16(filter_y);
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)filter_x;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+ if (w == 4) {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
+ s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+ s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
+
+ src += 7 * src_stride;
+
+ do {
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+
+ do {
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ height = h;
+ s = src + 7 * src_stride;
+ d = dst;
+
+ do {
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
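
Editorial note: the convolve8_4() and convolve8_8() helpers above vectorise an ordinary 8-tap FIR filter; the saturating vqadd for the s3/s4 terms keeps the two largest centre taps from overflowing the 16-bit accumulator, and vqrshrun_n_s16 performs the final round, shift and clamp. As a reference point, here is a minimal scalar sketch of one output pixel — not part of the patch, the name convolve8_scalar is hypothetical, and FILTER_BITS is assumed to be 7 as defined in aom_dsp/aom_filter.h.

static inline unsigned char convolve8_scalar(const unsigned char *s,
                                             const short *filter) {
  int sum = 0;
  /* 8-tap FIR: the NEON helpers apply these taps with vmla(q)_lane_s16. */
  for (int k = 0; k < 8; ++k) sum += s[k] * filter[k];
  /* Round, shift by FILTER_BITS (= 7) and clamp, as vqrshrun_n_s16 does. */
  sum = (sum + (1 << 6)) >> 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (unsigned char)sum;
}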
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c
new file mode 100644
index 0000000000..ac0a6efd00
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+ /* Shift left and insert new last column in transposed 4x4 block. */
+ 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+ /* Shift left and insert two new columns in transposed 4x4 block. */
+ 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+ /* Shift left and insert three new columns in transposed 4x4 block. */
+ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples,
+ const int8x8_t filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x2_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[2];
+ int32x4_t sum;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ sum = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
+ sum = vdotq_lane_s32(sum, permuted_samples[1], filter, 1);
+
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
+ const int8x8_t filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ /* First 4 output values. */
+ sum0 = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
+ sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
+ /* Second 4 output values. */
+ sum1 = vdotq_lane_s32(correction, permuted_samples[1], filter, 0);
+ sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
+ const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_x), 128);
+ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+ const uint8x16_t range_limit = vdupq_n_u8(128);
+ uint8x16_t s0, s1, s2, s3;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)x_step_q4;
+ (void)filter_y;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1);
+
+ if (w == 4) {
+ const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int16x4_t t0, t1, t2, t3;
+ uint8x8_t d01, d23;
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
+ t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
+ t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
+ t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
+ d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
+ d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+ d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+ int8x8_t a3, int8x16_t *b,
+ const uint8x16_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, XX, XX, XX, XX
+ * a1: 10, 11, 12, 13, XX, XX, XX, XX
+ * a2: 20, 21, 22, 23, XX, XX, XX, XX
+ * a3: 30, 31, 32, 33, XX, XX, XX, XX
+ *
+ * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+ *b = vqtbl2q_s8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+ int8x8_t a3, int8x16_t *b0,
+ int8x16_t *b1,
+ const uint8x16x2_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, 04, 05, 06, 07
+ * a1: 10, 11, 12, 13, 14, 15, 16, 17
+ * a2: 20, 21, 22, 23, 24, 25, 26, 27
+ * a3: 30, 31, 32, 33, 34, 35, 36, 37
+ *
+ * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+ *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
+ *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+}
+
+static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
+ const int8x16_t samples_hi,
+ const int32x4_t correction,
+ const int8x8_t filter) {
+ /* Sample range-clamping and permutation are performed by the caller. */
+ int32x4_t sum;
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ sum = vdotq_lane_s32(correction, samples_lo, filter, 0);
+ sum = vdotq_lane_s32(sum, samples_hi, filter, 1);
+
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
+ const int8x16_t samples0_hi,
+ const int8x16_t samples1_lo,
+ const int8x16_t samples1_hi,
+ const int32x4_t correction,
+ const int8x8_t filter) {
+ /* Sample range-clamping and permutation are performed by the caller. */
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ /* First 4 output values. */
+ sum0 = vdotq_lane_s32(correction, samples0_lo, filter, 0);
+ sum0 = vdotq_lane_s32(sum0, samples0_hi, filter, 1);
+ /* Second 4 output values. */
+ sum1 = vdotq_lane_s32(correction, samples1_lo, filter, 0);
+ sum1 = vdotq_lane_s32(sum1, samples1_hi, filter, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
+ const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_y), 128);
+ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+ const uint8x8_t range_limit = vdup_n_u8(128);
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+ int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int8x16x2_t samples_LUT;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)filter_x;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int16x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23;
+
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ src += 7 * src_stride;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+ s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+ s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+ s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+ s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+ s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+ s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+ s7 = vdup_n_s8(0);
+ s8 = vdup_n_s8(0);
+ s9 = vdup_n_s8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ uint8x8_t t7, t8, t9, t10;
+
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+ s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+ s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+ s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+ s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter);
+ d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter);
+ d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter);
+ d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s += 7 * src_stride;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+ s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+ s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+ s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+ s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+ s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+ s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+ s7 = vdup_n_s8(0);
+ s8 = vdup_n_s8(0);
+ s9 = vdup_n_s8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ uint8x8_t t7, t8, t9, t10;
+
+ load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+ s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+ s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+ s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+ s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ correction, filter);
+ d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ correction, filter);
+ d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ correction, filter);
+ d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ correction, filter);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
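
Editorial note: in the *_sdot helpers above, SDOT multiplies signed 8-bit values only, so each unsigned sample x in [0, 255] is shifted to x - 128 and the bias is restored by seeding the accumulator with correction = 128 * sum(filter), i.e. the vaddvq_s16(correct_tmp) value. The vertical path also recycles previously transposed columns through dot_prod_merge_block_tbl, so each iteration only loads and permutes four fresh rows. A hypothetical scalar model of the bias correction, not part of the patch:

static inline int convolve8_sdot_model(const unsigned char *x,
                                       const signed char *f) {
  int filter_sum = 0, clamped_dot = 0;
  for (int k = 0; k < 8; ++k) {
    filter_sum += f[k];                  /* Sigma f[k] */
    clamped_dot += (x[k] - 128) * f[k];  /* what the SDOT lanes accumulate */
  }
  /* correction == 128 * Sigma f[k], matching vaddvq_s16(filter * 128). */
  const int correction = 128 * filter_sum;
  /* Sum of (x - 128)*f plus 128*Sigma f equals the unsigned dot product x*f,
   * so the true convolution is recovered without widening the 8-bit input. */
  return clamped_dot + correction;
}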
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c
new file mode 100644
index 0000000000..c314c0a192
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+ /* Shift left and insert new last column in transposed 4x4 block. */
+ 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+ /* Shift left and insert two new columns in transposed 4x4 block. */
+ 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+ /* Shift left and insert three new columns in transposed 4x4 block. */
+ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples,
+ const int8x8_t filter,
+ const uint8x16x2_t permute_tbl) {
+ uint8x16_t permuted_samples[2];
+ int32x4_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+
+ sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
+ sum = vusdotq_lane_s32(sum, permuted_samples[1], filter, 1);
+
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples,
+ const int8x8_t filter,
+ const uint8x16x3_t permute_tbl) {
+ uint8x16_t permuted_samples[3];
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
+ sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filter, 0);
+ sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
+ uint8x16_t s0, s1, s2, s3;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)x_step_q4;
+ (void)filter_y;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1);
+
+ if (w == 4) {
+ const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int16x4_t t0, t1, t2, t3;
+ uint8x8_t d01, d23;
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_usdot(s0, filter, perm_tbl);
+ t1 = convolve8_4_usdot(s1, filter, perm_tbl);
+ t2 = convolve8_4_usdot(s2, filter, perm_tbl);
+ t3 = convolve8_4_usdot(s3, filter, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_usdot(s0, filter, perm_tbl);
+ d1 = convolve8_8_usdot(s1, filter, perm_tbl);
+ d2 = convolve8_8_usdot(s2, filter, perm_tbl);
+ d3 = convolve8_8_usdot(s3, filter, perm_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b,
+ const uint8x16_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, XX, XX, XX, XX
+ * a1: 10, 11, 12, 13, XX, XX, XX, XX
+ * a2: 20, 21, 22, 23, XX, XX, XX, XX
+ * a3: 30, 31, 32, 33, XX, XX, XX, XX
+ *
+ * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+ *b = vqtbl2q_u8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b0, uint8x16_t *b1,
+ const uint8x16x2_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, 04, 05, 06, 07
+ * a1: 10, 11, 12, 13, 14, 15, 16, 17
+ * a2: 20, 21, 22, 23, 24, 25, 26, 27
+ * a3: 30, 31, 32, 33, 34, 35, 36, 37
+ *
+ * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+ *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
+ *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
+}
+
+static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
+ const uint8x16_t samples_hi,
+ const int8x8_t filter) {
+ /* Sample permutation is performed by the caller. */
+ int32x4_t sum;
+
+ sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filter, 0);
+ sum = vusdotq_lane_s32(sum, samples_hi, filter, 1);
+
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
+ const uint8x16_t samples0_hi,
+ const uint8x16_t samples1_lo,
+ const uint8x16_t samples1_hi,
+ const int8x8_t filter) {
+ /* Sample permutation is performed by the caller. */
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filter, 0);
+ sum0 = vusdotq_lane_s32(sum0, samples0_hi, filter, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filter, 0);
+ sum1 = vusdotq_lane_s32(sum1, samples1_hi, filter, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint8x16x2_t samples_LUT;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)filter_x;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int16x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23;
+
+ load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_usdot_partial(s0123, s4567, filter);
+ d1 = convolve8_4_usdot_partial(s1234, s5678, filter);
+ d2 = convolve8_4_usdot_partial(s2345, s6789, filter);
+ d3 = convolve8_4_usdot_partial(s3456, s78910, filter);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ filter);
+ d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ filter);
+ d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ filter);
+ d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ filter);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
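
Editorial note: the i8mm path mirrors the dotprod one, but vusdotq_lane_s32 (USDOT) multiplies unsigned samples by signed taps directly, so the range clamp and correction term disappear and every accumulation starts from vdupq_n_s32(0). An illustrative scalar model of one 32-bit accumulator lane — an assumption-level sketch, not part of the patch:

static inline int usdot_lane_model(int acc, const unsigned char s[4],
                                   const signed char f[4]) {
  /* Each 32-bit lane accumulates four unsigned-by-signed 8-bit products. */
  for (int j = 0; j < 4; ++j) acc += s[j] * f[j];
  return acc;
}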
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c
new file mode 100644
index 0000000000..325d6f29ff
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+void aom_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+ const uint8_t *src1;
+ uint8_t *dst1;
+ int y;
+
+ if (!(w & 0x0F)) {
+ for (y = 0; y < h; ++y) {
+ src1 = src;
+ dst1 = dst;
+ for (int x = 0; x < (w >> 4); ++x) {
+ vst1q_u8(dst1, vld1q_u8(src1));
+ src1 += 16;
+ dst1 += 16;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } else if (!(w & 0x07)) {
+ for (y = 0; y < h; ++y) {
+ vst1_u8(dst, vld1_u8(src));
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } else if (!(w & 0x03)) {
+ for (y = 0; y < h; ++y) {
+ memcpy(dst, src, sizeof(uint32_t));
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } else if (!(w & 0x01)) {
+ for (y = 0; y < h; ++y) {
+ memcpy(dst, src, sizeof(uint16_t));
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride, int w,
+ int h) {
+ if (w < 8) { // copy4
+ uint16x4_t s0, s1;
+ do {
+ s0 = vld1_u16(src);
+ src += src_stride;
+ s1 = vld1_u16(src);
+ src += src_stride;
+
+ vst1_u16(dst, s0);
+ dst += dst_stride;
+ vst1_u16(dst, s1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 8) { // copy8
+ uint16x8_t s0, s1;
+ do {
+ s0 = vld1q_u16(src);
+ src += src_stride;
+ s1 = vld1q_u16(src);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ dst += dst_stride;
+ vst1q_u16(dst, s1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w < 32) { // copy16
+ uint16x8_t s0, s1, s2, s3;
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ src += src_stride;
+ s2 = vld1q_u16(src);
+ s3 = vld1q_u16(src + 8);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ vst1q_u16(dst + 8, s1);
+ dst += dst_stride;
+ vst1q_u16(dst, s2);
+ vst1q_u16(dst + 8, s3);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 32) { // copy32
+ uint16x8_t s0, s1, s2, s3;
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ vst1q_u16(dst + 8, s1);
+ vst1q_u16(dst + 16, s2);
+ vst1q_u16(dst + 24, s3);
+ dst += dst_stride;
+ } while (--h != 0);
+ } else { // copy64
+ uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ do {
+ const uint16_t *s = src;
+ uint16_t *d = dst;
+ int width = w;
+ do {
+ s0 = vld1q_u16(s);
+ s1 = vld1q_u16(s + 8);
+ s2 = vld1q_u16(s + 16);
+ s3 = vld1q_u16(s + 24);
+ s4 = vld1q_u16(s + 32);
+ s5 = vld1q_u16(s + 40);
+ s6 = vld1q_u16(s + 48);
+ s7 = vld1q_u16(s + 56);
+
+ vst1q_u16(d, s0);
+ vst1q_u16(d + 8, s1);
+ vst1q_u16(d + 16, s2);
+ vst1q_u16(d + 24, s3);
+ vst1q_u16(d + 32, s4);
+ vst1q_u16(d + 40, s5);
+ vst1q_u16(d + 48, s6);
+ vst1q_u16(d + 56, s7);
+ s += 64;
+ d += 64;
+ width -= 64;
+ } while (width > 0);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ }
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
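
Editorial note: aom_convolve_copy_neon() dispatches on the width class (multiples of 16, then 8, 4 and 2 bytes) but is functionally a per-row copy. A scalar sketch of the same behaviour, assuming the power-of-two block widths the branch structure expects; not part of the patch:

#include <string.h>

static void convolve_copy_scalar(const unsigned char *src, long src_stride,
                                 unsigned char *dst, long dst_stride, int w,
                                 int h) {
  for (int y = 0; y < h; ++y) {
    memcpy(dst, src, (size_t)w);  /* each NEON branch copies one row of w bytes */
    src += src_stride;
    dst += dst_stride;
  }
}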
diff --git a/third_party/aom/aom_dsp/arm/avg_neon.c b/third_party/aom/aom_dsp/arm/avg_neon.c
new file mode 100644
index 0000000000..2e79b2ef69
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/avg_neon.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+unsigned int aom_avg_4x4_neon(const uint8_t *p, int stride) {
+ const uint8x8_t s0 = load_unaligned_u8(p, stride);
+ const uint8x8_t s1 = load_unaligned_u8(p + 2 * stride, stride);
+
+ const uint32_t sum = horizontal_add_u16x8(vaddl_u8(s0, s1));
+ return (sum + (1 << 3)) >> 4;
+}
+
+unsigned int aom_avg_8x8_neon(const uint8_t *p, int stride) {
+ uint8x8_t s0 = vld1_u8(p);
+ p += stride;
+ uint8x8_t s1 = vld1_u8(p);
+ p += stride;
+ uint16x8_t acc = vaddl_u8(s0, s1);
+
+ int i = 0;
+ do {
+ const uint8x8_t si = vld1_u8(p);
+ p += stride;
+ acc = vaddw_u8(acc, si);
+ } while (++i < 6);
+
+ const uint32_t sum = horizontal_add_u16x8(acc);
+ return (sum + (1 << 5)) >> 6;
+}
+
+void aom_avg_8x8_quad_neon(const uint8_t *s, int p, int x16_idx, int y16_idx,
+ int *avg) {
+ avg[0] = aom_avg_8x8_neon(s + y16_idx * p + x16_idx, p);
+ avg[1] = aom_avg_8x8_neon(s + y16_idx * p + (x16_idx + 8), p);
+ avg[2] = aom_avg_8x8_neon(s + (y16_idx + 8) * p + x16_idx, p);
+ avg[3] = aom_avg_8x8_neon(s + (y16_idx + 8) * p + (x16_idx + 8), p);
+}
+
+int aom_satd_lp_neon(const int16_t *coeff, int length) {
+ int16x8_t s0 = vld1q_s16(coeff);
+ int16x8_t s1 = vld1q_s16(coeff + 8);
+
+ int16x8_t abs0 = vabsq_s16(s0);
+ int16x8_t abs1 = vabsq_s16(s1);
+
+ int32x4_t acc0 = vpaddlq_s16(abs0);
+ int32x4_t acc1 = vpaddlq_s16(abs1);
+
+ length -= 16;
+ coeff += 16;
+
+ while (length != 0) {
+ s0 = vld1q_s16(coeff);
+ s1 = vld1q_s16(coeff + 8);
+
+ abs0 = vabsq_s16(s0);
+ abs1 = vabsq_s16(s1);
+
+ acc0 = vpadalq_s16(acc0, abs0);
+ acc1 = vpadalq_s16(acc1, abs1);
+
+ length -= 16;
+ coeff += 16;
+ }
+
+ int32x4_t accum = vaddq_s32(acc0, acc1);
+ return horizontal_add_s32x4(accum);
+}
+
+void aom_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ assert(width % 16 == 0);
+ assert(height % 4 == 0);
+
+ const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor);
+ uint16x8_t sum_lo[2], sum_hi[2];
+
+ int w = 0;
+ do {
+ const uint8_t *r = ref + w;
+ uint8x16_t r0 = vld1q_u8(r + 0 * ref_stride);
+ uint8x16_t r1 = vld1q_u8(r + 1 * ref_stride);
+ uint8x16_t r2 = vld1q_u8(r + 2 * ref_stride);
+ uint8x16_t r3 = vld1q_u8(r + 3 * ref_stride);
+
+ sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
+ sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
+ sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
+ sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
+
+ r += 4 * ref_stride;
+
+ for (int h = height - 4; h != 0; h -= 4) {
+ r0 = vld1q_u8(r + 0 * ref_stride);
+ r1 = vld1q_u8(r + 1 * ref_stride);
+ r2 = vld1q_u8(r + 2 * ref_stride);
+ r3 = vld1q_u8(r + 3 * ref_stride);
+
+ uint16x8_t tmp0_lo = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
+ uint16x8_t tmp0_hi = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
+ uint16x8_t tmp1_lo = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
+ uint16x8_t tmp1_hi = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
+
+ sum_lo[0] = vaddq_u16(sum_lo[0], tmp0_lo);
+ sum_hi[0] = vaddq_u16(sum_hi[0], tmp0_hi);
+ sum_lo[1] = vaddq_u16(sum_lo[1], tmp1_lo);
+ sum_hi[1] = vaddq_u16(sum_hi[1], tmp1_hi);
+
+ r += 4 * ref_stride;
+ }
+
+ sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]);
+ sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]);
+
+ const int16x8_t avg0 =
+ vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor);
+ const int16x8_t avg1 =
+ vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor);
+
+ vst1q_s16(hbuf + w, avg0);
+ vst1q_s16(hbuf + w + 8, avg1);
+ w += 16;
+ } while (w < width);
+}
+
+void aom_int_pro_col_neon(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ assert(width % 16 == 0);
+ assert(height % 4 == 0);
+
+ const int16x4_t neg_norm_factor = vdup_n_s16(-norm_factor);
+ uint16x8_t sum[4];
+
+ int h = 0;
+ do {
+ sum[0] = vpaddlq_u8(vld1q_u8(ref + 0 * ref_stride));
+ sum[1] = vpaddlq_u8(vld1q_u8(ref + 1 * ref_stride));
+ sum[2] = vpaddlq_u8(vld1q_u8(ref + 2 * ref_stride));
+ sum[3] = vpaddlq_u8(vld1q_u8(ref + 3 * ref_stride));
+
+ for (int w = 16; w < width; w += 16) {
+ sum[0] = vpadalq_u8(sum[0], vld1q_u8(ref + 0 * ref_stride + w));
+ sum[1] = vpadalq_u8(sum[1], vld1q_u8(ref + 1 * ref_stride + w));
+ sum[2] = vpadalq_u8(sum[2], vld1q_u8(ref + 2 * ref_stride + w));
+ sum[3] = vpadalq_u8(sum[3], vld1q_u8(ref + 3 * ref_stride + w));
+ }
+
+ uint16x4_t sum_4d = vmovn_u32(horizontal_add_4d_u16x8(sum));
+ int16x4_t avg = vshl_s16(vreinterpret_s16_u16(sum_4d), neg_norm_factor);
+ vst1_s16(vbuf + h, avg);
+
+ ref += 4 * ref_stride;
+ h += 4;
+ } while (h < height);
+}
+
+// coeff: 20 bits, dynamic range [-524287, 524287].
+// length: value range {16, 32, 64, 128, 256, 512, 1024}.
+int aom_satd_neon(const tran_low_t *coeff, int length) {
+ const int32x4_t zero = vdupq_n_s32(0);
+
+ int32x4_t s0 = vld1q_s32(&coeff[0]);
+ int32x4_t s1 = vld1q_s32(&coeff[4]);
+ int32x4_t s2 = vld1q_s32(&coeff[8]);
+ int32x4_t s3 = vld1q_s32(&coeff[12]);
+
+ int32x4_t accum0 = vabsq_s32(s0);
+ int32x4_t accum1 = vabsq_s32(s2);
+ accum0 = vabaq_s32(accum0, s1, zero);
+ accum1 = vabaq_s32(accum1, s3, zero);
+
+ length -= 16;
+ coeff += 16;
+
+ while (length != 0) {
+ s0 = vld1q_s32(&coeff[0]);
+ s1 = vld1q_s32(&coeff[4]);
+ s2 = vld1q_s32(&coeff[8]);
+ s3 = vld1q_s32(&coeff[12]);
+
+ accum0 = vabaq_s32(accum0, s0, zero);
+ accum1 = vabaq_s32(accum1, s1, zero);
+ accum0 = vabaq_s32(accum0, s2, zero);
+ accum1 = vabaq_s32(accum1, s3, zero);
+
+ length -= 16;
+ coeff += 16;
+ }
+
+ // satd: 30 bits, dynamic range [-524287 * 1024, 524287 * 1024]
+ return horizontal_add_s32x4(vaddq_s32(accum0, accum1));
+}
+
+int aom_vector_var_neon(const int16_t *ref, const int16_t *src, int bwl) {
+ assert(bwl >= 2 && bwl <= 5);
+ int width = 4 << bwl;
+
+ int16x8_t r = vld1q_s16(ref);
+ int16x8_t s = vld1q_s16(src);
+
+  // diff: dynamic range [-510, 510], 10 (signed) bits.
+ int16x8_t diff = vsubq_s16(r, s);
+ // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits.
+ int16x8_t v_mean = diff;
+ // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits.
+ int32x4_t v_sse[2];
+ v_sse[0] = vmull_s16(vget_low_s16(diff), vget_low_s16(diff));
+ v_sse[1] = vmull_s16(vget_high_s16(diff), vget_high_s16(diff));
+
+ ref += 8;
+ src += 8;
+ width -= 8;
+
+ do {
+ r = vld1q_s16(ref);
+ s = vld1q_s16(src);
+
+ diff = vsubq_s16(r, s);
+ v_mean = vaddq_s16(v_mean, diff);
+
+ v_sse[0] = vmlal_s16(v_sse[0], vget_low_s16(diff), vget_low_s16(diff));
+ v_sse[1] = vmlal_s16(v_sse[1], vget_high_s16(diff), vget_high_s16(diff));
+
+ ref += 8;
+ src += 8;
+ width -= 8;
+ } while (width != 0);
+
+ // Dynamic range [0, 65280], 16 (unsigned) bits.
+ const uint32_t mean_abs = abs(horizontal_add_s16x8(v_mean));
+ const int32_t sse = horizontal_add_s32x4(vaddq_s32(v_sse[0], v_sse[1]));
+
+ // (mean_abs * mean_abs): dynamic range 32 (unsigned) bits.
+ return sse - ((mean_abs * mean_abs) >> (bwl + 2));
+}
+
+void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int *min, int *max) {
+ // Load and concatenate.
+ const uint8x16_t a01 = load_u8_8x2(a + 0 * a_stride, a_stride);
+ const uint8x16_t a23 = load_u8_8x2(a + 2 * a_stride, a_stride);
+ const uint8x16_t a45 = load_u8_8x2(a + 4 * a_stride, a_stride);
+ const uint8x16_t a67 = load_u8_8x2(a + 6 * a_stride, a_stride);
+
+ const uint8x16_t b01 = load_u8_8x2(b + 0 * b_stride, b_stride);
+ const uint8x16_t b23 = load_u8_8x2(b + 2 * b_stride, b_stride);
+ const uint8x16_t b45 = load_u8_8x2(b + 4 * b_stride, b_stride);
+ const uint8x16_t b67 = load_u8_8x2(b + 6 * b_stride, b_stride);
+
+ // Absolute difference.
+ const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
+ const uint8x16_t ab23_diff = vabdq_u8(a23, b23);
+ const uint8x16_t ab45_diff = vabdq_u8(a45, b45);
+ const uint8x16_t ab67_diff = vabdq_u8(a67, b67);
+
+ // Max values between the Q vectors.
+ const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff);
+ const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff);
+ const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff);
+ const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff);
+
+ const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
+ const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
+
+#if AOM_ARCH_AARCH64
+ *min = *max = 0; // Clear high bits
+ *((uint8_t *)max) = vmaxvq_u8(ab07_max);
+ *((uint8_t *)min) = vminvq_u8(ab07_min);
+#else
+ // Split into 64-bit vectors and execute pairwise min/max.
+ uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
+ uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
+
+ // Enough runs of vpmax/min propagate the max/min values to every position.
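+  // For example (illustrative values only), { 7, 3, 5, 1, 6, 2, 8, 4 } becomes
+  // { 7, 5, 6, 8, 7, 5, 6, 8 }, then { 7, 8, 7, 8, ... }, then all 8s after
+  // three vpmax rounds.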
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ *min = *max = 0; // Clear high bits
+ // Store directly to avoid costly neon->gpr transfer.
+ vst1_lane_u8((uint8_t *)max, ab_max, 0);
+ vst1_lane_u8((uint8_t *)min, ab_min, 0);
+#endif
+}
diff --git a/third_party/aom/aom_dsp/arm/avg_pred_neon.c b/third_party/aom/aom_dsp/arm/avg_pred_neon.c
new file mode 100644
index 0000000000..b17f7fca7f
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/avg_pred_neon.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ if (width > 8) {
+ do {
+ const uint8_t *pred_ptr = pred;
+ const uint8_t *ref_ptr = ref;
+ uint8_t *comp_pred_ptr = comp_pred;
+ int w = width;
+
+ do {
+ const uint8x16_t p = vld1q_u8(pred_ptr);
+ const uint8x16_t r = vld1q_u8(ref_ptr);
+ const uint8x16_t avg = vrhaddq_u8(p, r);
+
+ vst1q_u8(comp_pred_ptr, avg);
+
+ ref_ptr += 16;
+ pred_ptr += 16;
+ comp_pred_ptr += 16;
+ w -= 16;
+ } while (w != 0);
+
+ ref += ref_stride;
+ pred += width;
+ comp_pred += width;
+ } while (--height != 0);
+ } else if (width == 8) {
+ int h = height / 2;
+
+ do {
+ const uint8x16_t p = vld1q_u8(pred);
+ const uint8x16_t r = load_u8_8x2(ref, ref_stride);
+ const uint8x16_t avg = vrhaddq_u8(p, r);
+
+ vst1q_u8(comp_pred, avg);
+
+ ref += 2 * ref_stride;
+ pred += 16;
+ comp_pred += 16;
+ } while (--h != 0);
+ } else {
+ int h = height / 4;
+ assert(width == 4);
+
+ do {
+ const uint8x16_t p = vld1q_u8(pred);
+ const uint8x16_t r = load_unaligned_u8q(ref, ref_stride);
+ const uint8x16_t avg = vrhaddq_u8(p, r);
+
+ vst1q_u8(comp_pred, avg);
+
+ ref += 4 * ref_stride;
+ pred += 16;
+ comp_pred += 16;
+ } while (--h != 0);
+ }
+}
+
+void aom_dist_wtd_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+
+ if (width > 8) {
+ do {
+ const uint8_t *pred_ptr = pred;
+ const uint8_t *ref_ptr = ref;
+ uint8_t *comp_pred_ptr = comp_pred;
+ int w = width;
+
+ do {
+ const uint8x16_t p = vld1q_u8(pred_ptr);
+ const uint8x16_t r = vld1q_u8(ref_ptr);
+
+ const uint8x16_t wtd_avg =
+ dist_wtd_avg_u8x16(r, p, fwd_offset, bck_offset);
+
+ vst1q_u8(comp_pred_ptr, wtd_avg);
+
+ ref_ptr += 16;
+ pred_ptr += 16;
+ comp_pred_ptr += 16;
+ w -= 16;
+ } while (w != 0);
+
+ ref += ref_stride;
+ pred += width;
+ comp_pred += width;
+ } while (--height != 0);
+ } else if (width == 8) {
+ int h = height / 2;
+
+ do {
+ const uint8x16_t p = vld1q_u8(pred);
+ const uint8x16_t r = load_u8_8x2(ref, ref_stride);
+
+ const uint8x16_t wtd_avg =
+ dist_wtd_avg_u8x16(r, p, fwd_offset, bck_offset);
+
+ vst1q_u8(comp_pred, wtd_avg);
+
+ ref += 2 * ref_stride;
+ pred += 16;
+ comp_pred += 16;
+ } while (--h != 0);
+ } else {
+ int h = height / 2;
+ assert(width == 4);
+
+ do {
+ const uint8x8_t p = vld1_u8(pred);
+ const uint8x8_t r = load_unaligned_u8_4x2(ref, ref_stride);
+
+ const uint8x8_t wtd_avg = dist_wtd_avg_u8x8(r, p, vget_low_u8(fwd_offset),
+ vget_low_u8(bck_offset));
+
+ vst1_u8(comp_pred, wtd_avg);
+
+ ref += 2 * ref_stride;
+ pred += 8;
+ comp_pred += 8;
+ } while (--h != 0);
+ }
+}
+
+void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride,
+ const uint8_t *mask, int mask_stride,
+ int invert_mask) {
+ const uint8_t *src0 = invert_mask ? pred : ref;
+ const uint8_t *src1 = invert_mask ? ref : pred;
+ const int src_stride0 = invert_mask ? width : ref_stride;
+ const int src_stride1 = invert_mask ? ref_stride : width;
+
+ if (width > 8) {
+ do {
+ const uint8_t *src0_ptr = src0;
+ const uint8_t *src1_ptr = src1;
+ const uint8_t *mask_ptr = mask;
+ uint8_t *comp_pred_ptr = comp_pred;
+ int w = width;
+
+ do {
+ const uint8x16_t s0 = vld1q_u8(src0_ptr);
+ const uint8x16_t s1 = vld1q_u8(src1_ptr);
+ const uint8x16_t m0 = vld1q_u8(mask_ptr);
+
+ uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, s0, s1);
+
+ vst1q_u8(comp_pred_ptr, blend_u8);
+
+ src0_ptr += 16;
+ src1_ptr += 16;
+ mask_ptr += 16;
+ comp_pred_ptr += 16;
+ w -= 16;
+ } while (w != 0);
+
+ src0 += src_stride0;
+ src1 += src_stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ } while (--height != 0);
+ } else if (width == 8) {
+ do {
+ const uint8x8_t s0 = vld1_u8(src0);
+ const uint8x8_t s1 = vld1_u8(src1);
+ const uint8x8_t m0 = vld1_u8(mask);
+
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ vst1_u8(comp_pred, blend_u8);
+
+ src0 += src_stride0;
+ src1 += src_stride1;
+ mask += mask_stride;
+ comp_pred += 8;
+ } while (--height != 0);
+ } else {
+ int h = height / 2;
+ assert(width == 4);
+
+ do {
+ const uint8x8_t s0 = load_unaligned_u8(src0, src_stride0);
+ const uint8x8_t s1 = load_unaligned_u8(src1, src_stride1);
+ const uint8x8_t m0 = load_unaligned_u8(mask, mask_stride);
+
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ vst1_u8(comp_pred, blend_u8);
+
+ src0 += 2 * src_stride0;
+ src1 += 2 * src_stride1;
+ mask += 2 * mask_stride;
+ comp_pred += 8;
+ } while (--h != 0);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/avg_sve.c b/third_party/aom/aom_dsp/arm/avg_sve.c
new file mode 100644
index 0000000000..bbf5a9447c
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/avg_sve.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+
+int aom_vector_var_sve(const int16_t *ref, const int16_t *src, int bwl) {
+ assert(bwl >= 2 && bwl <= 5);
+ int width = 4 << bwl;
+
+ int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+ int16x8_t v_mean[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+
+ do {
+ int16x8_t r0 = vld1q_s16(ref);
+ int16x8_t s0 = vld1q_s16(src);
+
+    // diff: dynamic range [-510, 510], 10 (signed) bits.
+ int16x8_t diff0 = vsubq_s16(r0, s0);
+ // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits.
+ v_mean[0] = vaddq_s16(v_mean[0], diff0);
+
+ // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits.
+ sse_s64[0] = aom_sdotq_s16(sse_s64[0], diff0, diff0);
+
+ int16x8_t r1 = vld1q_s16(ref + 8);
+ int16x8_t s1 = vld1q_s16(src + 8);
+
+    // diff: dynamic range [-510, 510], 10 (signed) bits.
+ int16x8_t diff1 = vsubq_s16(r1, s1);
+ // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits.
+ v_mean[1] = vaddq_s16(v_mean[1], diff1);
+
+ // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits.
+ sse_s64[1] = aom_sdotq_s16(sse_s64[1], diff1, diff1);
+
+ ref += 16;
+ src += 16;
+ width -= 16;
+ } while (width != 0);
+
+ // Dynamic range [0, 65280], 16 (unsigned) bits.
+ const uint32_t mean_abs = abs(vaddlvq_s16(vaddq_s16(v_mean[0], v_mean[1])));
+ const int64_t sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[1]));
+
+ // (mean_abs * mean_abs): dynamic range 32 (unsigned) bits.
+ return (int)(sse - ((mean_abs * mean_abs) >> (bwl + 2)));
+}
diff --git a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
new file mode 100644
index 0000000000..1bc3b80310
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b,
+ uint16x8_t round_offset) {
+ const uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
+
+ uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(m), vget_low_u16(a));
+ uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(m), vget_high_u16(a));
+
+ blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b));
+ blend_u32_hi =
+ vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b));
+
+ uint16x4_t blend_u16_lo = vshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS);
+ uint16x4_t blend_u16_hi = vshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS);
+
+ uint16x8_t res = vcombine_u16(blend_u16_lo, blend_u16_hi);
+
+ res = vqsubq_u16(res, round_offset);
+
+ return vqrshrn_n_u16(res,
+ 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
+}
+
+void aom_lowbd_blend_a64_d16_mask_neon(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params) {
+ (void)conv_params;
+
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const uint16x8_t offset_vec = vdupq_n_u16(round_offset);
+
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (subw == 0 && subh == 0) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);
+
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride));
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (subw == 1 && subh == 1) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i);
+ uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8);
+ uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg =
+ vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+
+ uint8x8_t blend =
+ alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (subw == 1 && subh == 0) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint8x8_t blend =
+ alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));
+ uint8x8_t blend =
+ alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0_2 =
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+ uint8x8_t m1_3 =
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ }
+}
+
+void aom_blend_a64_mask_neon(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subw, int subh) {
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if ((subw | subh) == 0) {
+ if (w > 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + i);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+ i += 16;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = load_unaligned_u8_4x2(mask, mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if ((subw & subh) == 1) {
+ if (w > 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i);
+ uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i);
+ uint8x16_t m2 = vld1q_u8(mask + 0 * mask_stride + 2 * i + 16);
+ uint8x16_t m3 = vld1q_u8(mask + 1 * mask_stride + 2 * i + 16);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t m_avg = avg_blend_pairwise_u8x16_4(m0, m1, m2, m3);
+ uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 8);
+ uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 8);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (subw == 1 && subh == 0) {
+ if (w > 8) {
+ do {
+ int i = 0;
+
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + 2 * i);
+ uint8x16_t m1 = vld1q_u8(mask + 2 * i + 16);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t m_avg = avg_blend_pairwise_u8x16(m0, m1);
+ uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask);
+ uint8x8_t m1 = vld1_u8(mask + 8);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else {
+ if (w > 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + i);
+ uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + i);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t m_avg = avg_blend_u8x16(m0, m1);
+ uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t m_avg = avg_blend_u8x8(m0, m1);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0_2 =
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+ uint8x8_t m1_3 =
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t m_avg = avg_blend_u8x8(m0_2, m1_3);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/blend_neon.h b/third_party/aom/aom_dsp/arm/blend_neon.h
new file mode 100644
index 0000000000..c8a03224e4
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/blend_neon.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_BLEND_NEON_H_
+#define AOM_AOM_DSP_ARM_BLEND_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/blend.h"
+
+static INLINE uint8x16_t alpha_blend_a64_u8x16(uint8x16_t m, uint8x16_t a,
+ uint8x16_t b) {
+ const uint8x16_t m_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m);
+
+ uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m), vget_low_u8(a));
+ uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m), vget_high_u8(a));
+
+ blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m_inv), vget_low_u8(b));
+ blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m_inv), vget_high_u8(b));
+
+ uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
+ uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
+
+ return vcombine_u8(blend_u8_lo, blend_u8_hi);
+}
+
+static INLINE uint8x8_t alpha_blend_a64_u8x8(uint8x8_t m, uint8x8_t a,
+ uint8x8_t b) {
+ const uint8x8_t m_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m);
+
+ uint16x8_t blend_u16 = vmull_u8(m, a);
+
+ blend_u16 = vmlal_u8(blend_u16, m_inv, b);
+
+ return vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE uint16x8_t alpha_blend_a64_u16x8(uint16x8_t m, uint16x8_t a,
+ uint16x8_t b) {
+ uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
+
+ uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(a), vget_low_u16(m));
+ uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(a), vget_high_u16(m));
+
+ blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(b), vget_low_u16(m_inv));
+ blend_u32_hi =
+ vmlal_u16(blend_u32_hi, vget_high_u16(b), vget_high_u16(m_inv));
+
+ uint16x4_t blend_u16_lo =
+ vrshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS);
+ uint16x4_t blend_u16_hi =
+ vrshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS);
+
+ return vcombine_u16(blend_u16_lo, blend_u16_hi);
+}
+
+static INLINE uint16x4_t alpha_blend_a64_u16x4(uint16x4_t m, uint16x4_t a,
+ uint16x4_t b) {
+ const uint16x4_t m_inv = vsub_u16(vdup_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
+
+  uint32x4_t blend_u32 = vmull_u16(m, a);
+
+  blend_u32 = vmlal_u16(blend_u32, m_inv, b);
+
+  return vrshrn_n_u32(blend_u32, AOM_BLEND_A64_ROUND_BITS);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE uint8x8_t avg_blend_u8x8(uint8x8_t a, uint8x8_t b) {
+ return vrhadd_u8(a, b);
+}
+
+static INLINE uint8x16_t avg_blend_u8x16(uint8x16_t a, uint8x16_t b) {
+ return vrhaddq_u8(a, b);
+}
+
+static INLINE uint8x8_t avg_blend_pairwise_u8x8(uint8x8_t a, uint8x8_t b) {
+ return vrshr_n_u8(vpadd_u8(a, b), 1);
+}
+
+static INLINE uint8x16_t avg_blend_pairwise_u8x16(uint8x16_t a, uint8x16_t b) {
+#if AOM_ARCH_AARCH64
+ return vrshrq_n_u8(vpaddq_u8(a, b), 1);
+#else
+ uint8x8_t sum_pairwise_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+ uint8x8_t sum_pairwise_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b));
+ return vrshrq_n_u8(vcombine_u8(sum_pairwise_a, sum_pairwise_b), 1);
+#endif // AOM_ARCH_AARCH64
+}
+
+static INLINE uint8x8_t avg_blend_pairwise_u8x8_4(uint8x8_t a, uint8x8_t b,
+ uint8x8_t c, uint8x8_t d) {
+ uint8x8_t a_c = vpadd_u8(a, c);
+ uint8x8_t b_d = vpadd_u8(b, d);
+ return vrshr_n_u8(vqadd_u8(a_c, b_d), 2);
+}
+
+static INLINE uint8x16_t avg_blend_pairwise_u8x16_4(uint8x16_t a, uint8x16_t b,
+ uint8x16_t c,
+ uint8x16_t d) {
+#if AOM_ARCH_AARCH64
+ uint8x16_t a_c = vpaddq_u8(a, c);
+ uint8x16_t b_d = vpaddq_u8(b, d);
+ return vrshrq_n_u8(vqaddq_u8(a_c, b_d), 2);
+#else
+ uint8x8_t sum_pairwise_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+ uint8x8_t sum_pairwise_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b));
+ uint8x8_t sum_pairwise_c = vpadd_u8(vget_low_u8(c), vget_high_u8(c));
+ uint8x8_t sum_pairwise_d = vpadd_u8(vget_low_u8(d), vget_high_u8(d));
+ uint8x16_t a_c = vcombine_u8(sum_pairwise_a, sum_pairwise_c);
+ uint8x16_t b_d = vcombine_u8(sum_pairwise_b, sum_pairwise_d);
+ return vrshrq_n_u8(vqaddq_u8(a_c, b_d), 2);
+#endif // AOM_ARCH_AARCH64
+}
+
+#endif // AOM_AOM_DSP_ARM_BLEND_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c b/third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c
new file mode 100644
index 0000000000..f2ada93e95
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void get_blk_sse_sum_4xh_neon(const int16_t *data, int stride,
+ int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int i = bh;
+ int32x4_t sum = vdupq_n_s32(0);
+ int32x4_t sse = vdupq_n_s32(0);
+
+ do {
+ int16x8_t d = vcombine_s16(vld1_s16(data), vld1_s16(data + stride));
+
+ sum = vpadalq_s16(sum, d);
+
+ sse = vmlal_s16(sse, vget_low_s16(d), vget_low_s16(d));
+ sse = vmlal_s16(sse, vget_high_s16(d), vget_high_s16(d));
+
+ data += 2 * stride;
+ i -= 2;
+ } while (i != 0);
+
+ *x_sum = horizontal_add_s32x4(sum);
+ *x2_sum = horizontal_long_add_s32x4(sse);
+}
+
+static INLINE void get_blk_sse_sum_8xh_neon(const int16_t *data, int stride,
+ int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int i = bh;
+ int32x4_t sum = vdupq_n_s32(0);
+ int32x4_t sse = vdupq_n_s32(0);
+
+  // Input is 12-bit wide, so we can add up to 127 squared elements in a
+  // signed 32-bit element. Since we're accumulating into an int32x4_t and the
+  // maximum value for bh is 32, we don't have to worry about sse overflowing.
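+  // (Each squared element is below 2^24, and 127 * 2^24 = 2130706432 fits in
+  // INT32_MAX, whereas 128 * 2^24 would not.)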
+
+ do {
+ int16x8_t d = vld1q_s16(data);
+
+ sum = vpadalq_s16(sum, d);
+
+ sse = vmlal_s16(sse, vget_low_s16(d), vget_low_s16(d));
+ sse = vmlal_s16(sse, vget_high_s16(d), vget_high_s16(d));
+
+ data += stride;
+ } while (--i != 0);
+
+ *x_sum = horizontal_add_s32x4(sum);
+ *x2_sum = horizontal_long_add_s32x4(sse);
+}
+
+static INLINE void get_blk_sse_sum_large_neon(const int16_t *data, int stride,
+ int bw, int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int32x4_t sum = vdupq_n_s32(0);
+ int64x2_t sse = vdupq_n_s64(0);
+
+  // Input is 12-bit wide, so we can add up to 127 squared elements in a
+  // signed 32-bit element. Since we're accumulating into an int32x4_t vector,
+  // this means we can process up to (127 * 4) / bw rows before we need to
+  // widen to 64 bits.
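+  // (Each row adds bw / 4 squared elements to every 32-bit lane, so a lane
+  // stays below the 127-square limit for about (127 * 4) / bw rows.)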
+
+ int i_limit = (127 * 4) / bw;
+ int i_tmp = bh > i_limit ? i_limit : bh;
+
+ int i = 0;
+ do {
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+ do {
+ int j = bw;
+ const int16_t *data_ptr = data;
+ do {
+ int16x8_t d = vld1q_s16(data_ptr);
+
+ sum = vpadalq_s16(sum, d);
+
+ sse_s32 = vmlal_s16(sse_s32, vget_low_s16(d), vget_low_s16(d));
+ sse_s32 = vmlal_s16(sse_s32, vget_high_s16(d), vget_high_s16(d));
+
+ data_ptr += 8;
+ j -= 8;
+ } while (j != 0);
+
+ data += stride;
+ i++;
+ } while (i < i_tmp && i < bh);
+
+ sse = vpadalq_s32(sse, sse_s32);
+ i_tmp += i_limit;
+ } while (i < bh);
+
+ *x_sum = horizontal_add_s32x4(sum);
+ *x2_sum = horizontal_add_s64x2(sse);
+}
+
+void aom_get_blk_sse_sum_neon(const int16_t *data, int stride, int bw, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ if (bw == 4) {
+ get_blk_sse_sum_4xh_neon(data, stride, bh, x_sum, x2_sum);
+ } else if (bw == 8) {
+ get_blk_sse_sum_8xh_neon(data, stride, bh, x_sum, x2_sum);
+ } else {
+ assert(bw % 8 == 0);
+ get_blk_sse_sum_large_neon(data, stride, bw, bh, x_sum, x2_sum);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c b/third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c
new file mode 100644
index 0000000000..18bdc5dbfe
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+
+static INLINE void get_blk_sse_sum_4xh_sve(const int16_t *data, int stride,
+ int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int32x4_t sum = vdupq_n_s32(0);
+ int64x2_t sse = vdupq_n_s64(0);
+
+ do {
+ int16x8_t d = vcombine_s16(vld1_s16(data), vld1_s16(data + stride));
+
+ sum = vpadalq_s16(sum, d);
+
+ sse = aom_sdotq_s16(sse, d, d);
+
+ data += 2 * stride;
+ bh -= 2;
+ } while (bh != 0);
+
+ *x_sum = vaddvq_s32(sum);
+ *x2_sum = vaddvq_s64(sse);
+}
+
+static INLINE void get_blk_sse_sum_8xh_sve(const int16_t *data, int stride,
+ int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int32x4_t sum[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ int16x8_t d0 = vld1q_s16(data);
+ int16x8_t d1 = vld1q_s16(data + stride);
+
+ sum[0] = vpadalq_s16(sum[0], d0);
+ sum[1] = vpadalq_s16(sum[1], d1);
+
+ sse[0] = aom_sdotq_s16(sse[0], d0, d0);
+ sse[1] = aom_sdotq_s16(sse[1], d1, d1);
+
+ data += 2 * stride;
+ bh -= 2;
+ } while (bh != 0);
+
+ *x_sum = vaddvq_s32(vaddq_s32(sum[0], sum[1]));
+ *x2_sum = vaddvq_s64(vaddq_s64(sse[0], sse[1]));
+}
+
+static INLINE void get_blk_sse_sum_large_sve(const int16_t *data, int stride,
+ int bw, int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int32x4_t sum[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ int j = bw;
+ const int16_t *data_ptr = data;
+ do {
+ int16x8_t d0 = vld1q_s16(data_ptr);
+ int16x8_t d1 = vld1q_s16(data_ptr + 8);
+
+ sum[0] = vpadalq_s16(sum[0], d0);
+ sum[1] = vpadalq_s16(sum[1], d1);
+
+ sse[0] = aom_sdotq_s16(sse[0], d0, d0);
+ sse[1] = aom_sdotq_s16(sse[1], d1, d1);
+
+ data_ptr += 16;
+ j -= 16;
+ } while (j != 0);
+
+ data += stride;
+ } while (--bh != 0);
+
+ *x_sum = vaddvq_s32(vaddq_s32(sum[0], sum[1]));
+ *x2_sum = vaddvq_s64(vaddq_s64(sse[0], sse[1]));
+}
+
+void aom_get_blk_sse_sum_sve(const int16_t *data, int stride, int bw, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ if (bw == 4) {
+ get_blk_sse_sum_4xh_sve(data, stride, bh, x_sum, x2_sum);
+ } else if (bw == 8) {
+ get_blk_sse_sum_8xh_sve(data, stride, bh, x_sum, x2_sum);
+ } else {
+ assert(bw % 16 == 0);
+ get_blk_sse_sum_large_sve(data, stride, bw, bh, x_sum, x2_sum);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h b/third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h
new file mode 100644
index 0000000000..19c9b04c57
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
+#define AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+
+static INLINE uint8x8_t dist_wtd_avg_u8x8(uint8x8_t a, uint8x8_t b,
+ uint8x8_t wta, uint8x8_t wtb) {
+ uint16x8_t wtd_sum = vmull_u8(a, wta);
+
+ wtd_sum = vmlal_u8(wtd_sum, b, wtb);
+
+ return vrshrn_n_u16(wtd_sum, DIST_PRECISION_BITS);
+}
+
+static INLINE uint16x4_t dist_wtd_avg_u16x4(uint16x4_t a, uint16x4_t b,
+ uint16x4_t wta, uint16x4_t wtb) {
+ uint32x4_t wtd_sum = vmull_u16(a, wta);
+
+ wtd_sum = vmlal_u16(wtd_sum, b, wtb);
+
+ return vrshrn_n_u32(wtd_sum, DIST_PRECISION_BITS);
+}
+
+static INLINE uint8x16_t dist_wtd_avg_u8x16(uint8x16_t a, uint8x16_t b,
+ uint8x16_t wta, uint8x16_t wtb) {
+ uint16x8_t wtd_sum_lo = vmull_u8(vget_low_u8(a), vget_low_u8(wta));
+ uint16x8_t wtd_sum_hi = vmull_u8(vget_high_u8(a), vget_high_u8(wta));
+
+ wtd_sum_lo = vmlal_u8(wtd_sum_lo, vget_low_u8(b), vget_low_u8(wtb));
+ wtd_sum_hi = vmlal_u8(wtd_sum_hi, vget_high_u8(b), vget_high_u8(wtb));
+
+ uint8x8_t wtd_avg_lo = vrshrn_n_u16(wtd_sum_lo, DIST_PRECISION_BITS);
+ uint8x8_t wtd_avg_hi = vrshrn_n_u16(wtd_sum_hi, DIST_PRECISION_BITS);
+
+ return vcombine_u8(wtd_avg_lo, wtd_avg_hi);
+}
+
+static INLINE uint16x8_t dist_wtd_avg_u16x8(uint16x8_t a, uint16x8_t b,
+ uint16x8_t wta, uint16x8_t wtb) {
+ uint32x4_t wtd_sum_lo = vmull_u16(vget_low_u16(a), vget_low_u16(wta));
+ uint32x4_t wtd_sum_hi = vmull_u16(vget_high_u16(a), vget_high_u16(wta));
+
+ wtd_sum_lo = vmlal_u16(wtd_sum_lo, vget_low_u16(b), vget_low_u16(wtb));
+ wtd_sum_hi = vmlal_u16(wtd_sum_hi, vget_high_u16(b), vget_high_u16(wtb));
+
+ uint16x4_t wtd_avg_lo = vrshrn_n_u32(wtd_sum_lo, DIST_PRECISION_BITS);
+ uint16x4_t wtd_avg_hi = vrshrn_n_u32(wtd_sum_hi, DIST_PRECISION_BITS);
+
+ return vcombine_u16(wtd_avg_lo, wtd_avg_hi);
+}
+
+#endif // AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/dot_sve.h b/third_party/aom/aom_dsp/arm/dot_sve.h
new file mode 100644
index 0000000000..cf49f23606
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/dot_sve.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_DOT_SVE_H_
+#define AOM_AOM_DSP_ARM_DOT_SVE_H_
+
+#include <arm_neon_sve_bridge.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+// Dot product instructions operating on 16-bit input elements are exclusive to
+// the SVE instruction set. However, we can access these instructions from a
+// predominantly Neon context by making use of the Neon-SVE bridge intrinsics
+// to reinterpret Neon vectors as SVE vectors, with the high part of the SVE
+// vector (if it's longer than 128 bits) being "don't care".
+
+// While sub-optimal on machines with an SVE vector length greater than 128
+// bits (as the remainder of the vector is unused), this approach is still
+// beneficial compared to a Neon-only solution.
+
+static INLINE uint64x2_t aom_udotq_u16(uint64x2_t acc, uint16x8_t x,
+ uint16x8_t y) {
+ return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc),
+ svset_neonq_u16(svundef_u16(), x),
+ svset_neonq_u16(svundef_u16(), y)));
+}
+
+static INLINE int64x2_t aom_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) {
+ return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc),
+ svset_neonq_s16(svundef_s16(), x),
+ svset_neonq_s16(svundef_s16(), y)));
+}
+
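+// Illustrative sketch (not part of the AOM API): a sum-of-squares reduction
+// over an int16_t buffer using aom_sdotq_s16 above, in the same way the SVE
+// kernels in this directory accumulate into 64-bit lanes. Assumes n is a
+// non-zero multiple of 8.
+static INLINE int64_t aom_example_sum_squares_s16(const int16_t *buf, int n) {
+  int64x2_t acc = vdupq_n_s64(0);
+  do {
+    const int16x8_t v = vld1q_s16(buf);
+    // Widening dot product: each 64-bit lane accumulates four products.
+    acc = aom_sdotq_s16(acc, v, v);
+    buf += 8;
+    n -= 8;
+  } while (n != 0);
+  return vaddvq_s64(acc);
+}
+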
+#endif // AOM_AOM_DSP_ARM_DOT_SVE_H_
diff --git a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
new file mode 100644
index 0000000000..a4d6322f24
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
+static void aom_fdct4x4_helper(const int16_t *input, int stride,
+ int16x4_t *input_0, int16x4_t *input_1,
+ int16x4_t *input_2, int16x4_t *input_3) {
+ *input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+ *input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+ *input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+ *input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+ // If the very first value != 0, then add 1.
+ if (input[0] != 0) {
+ const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
+ *input_0 = vadd_s16(*input_0, one);
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ const int16x8_t input_01 = vcombine_s16(*input_0, *input_1);
+ const int16x8_t input_32 = vcombine_s16(*input_3, *input_2);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // (s_0 +/- s_1) * cospi_16_64
+ // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+ const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
+ const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
+ const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int32_t)cospi_16_64);
+ const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int32_t)cospi_16_64);
+
+ // fdct_round_shift
+ int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
+ int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int32_t)cospi_8_64);
+ const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int32_t)cospi_24_64);
+
+ const int32x4_t temp3 =
+ vmlal_n_s16(s_3_cospi_8_64, s_2, (int32_t)cospi_24_64);
+ const int32x4_t temp4 =
+ vmlsl_n_s16(s_3_cospi_24_64, s_2, (int32_t)cospi_8_64);
+
+ // fdct_round_shift
+ int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
+ int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
+
+    // Transpose only after the first pass.
+ if (i == 0) {
+ transpose_elems_inplace_s16_4x4(&out_0, &out_1, &out_2, &out_3);
+ }
+
+ *input_0 = out_0;
+ *input_1 = out_1;
+ *input_2 = out_2;
+ *input_3 = out_3;
+ }
+}
+
+void aom_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ // input[M * stride] * 16
+ int16x4_t input_0, input_1, input_2, input_3;
+
+ aom_fdct4x4_helper(input, stride, &input_0, &input_1, &input_2, &input_3);
+
+ // Not quite a rounding shift. Only add 1 despite shifting by 2.
+ const int16x8_t one = vdupq_n_s16(1);
+ int16x8_t out_01 = vcombine_s16(input_0, input_1);
+ int16x8_t out_23 = vcombine_s16(input_2, input_3);
+ out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
+ out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
+ store_s16q_to_tran_low(final_output + 0 * 8, out_01);
+ store_s16q_to_tran_low(final_output + 1 * 8, out_23);
+}
+
+void aom_fdct4x4_lp_neon(const int16_t *input, int16_t *final_output,
+ int stride) {
+ // input[M * stride] * 16
+ int16x4_t input_0, input_1, input_2, input_3;
+
+ aom_fdct4x4_helper(input, stride, &input_0, &input_1, &input_2, &input_3);
+
+ // Not quite a rounding shift. Only add 1 despite shifting by 2.
+ const int16x8_t one = vdupq_n_s16(1);
+ int16x8_t out_01 = vcombine_s16(input_0, input_1);
+ int16x8_t out_23 = vcombine_s16(input_2, input_3);
+ out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
+ out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
+ vst1q_s16(final_output + 0 * 8, out_01);
+ vst1q_s16(final_output + 1 * 8, out_23);
+}
+
+void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+ // stage 1
+ int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+ int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+ int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+ int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+ int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+ int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+ int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+ int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+ for (int i = 0; i < 2; ++i) {
+ int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
+ const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
+ const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
+ const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
+ const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
+ const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
+ const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
+ const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
+ const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
+ // fdct4(step, step);
+ int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+ int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+ int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+ int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+ // fdct4(step, step);
+ int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
+ int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
+ int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
+ int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
+ v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
+ v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
+ v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64);
+ v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64);
+ v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64);
+ v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43
+ out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63
+ out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47
+ out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67
+ }
+ // Stage 2
+ v_x0 = vsubq_s16(v_s6, v_s5);
+ v_x1 = vaddq_s16(v_s6, v_s5);
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x8_t ab = vcombine_s16(a, b);
+ const int16x8_t cd = vcombine_s16(c, d);
+ // Stage 3
+ v_x0 = vaddq_s16(v_s4, ab);
+ v_x1 = vsubq_s16(v_s4, ab);
+ v_x2 = vsubq_s16(v_s7, cd);
+ v_x3 = vaddq_s16(v_s7, cd);
+ }
+ // Stage 4
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
+ v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
+ v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
+ v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
+ v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
+ v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
+ v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
+ v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
+ v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
+ v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
+ v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53
+ out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73
+ out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57
+ out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
+ }
+ // transpose 8x8
+ {
+ // 00 01 02 03 40 41 42 43
+ // 10 11 12 13 50 51 52 53
+ // 20 21 22 23 60 61 62 63
+ // 30 31 32 33 70 71 72 73
+ // 04 05 06 07 44 45 46 47
+ // 14 15 16 17 54 55 56 57
+ // 24 25 26 27 64 65 66 67
+ // 34 35 36 37 74 75 76 77
+ const int32x4x2_t r02_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
+ const int32x4x2_t r13_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
+ const int32x4x2_t r46_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
+ const int32x4x2_t r57_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
+ const int16x8x2_t r01_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+ vreinterpretq_s16_s32(r13_s32.val[0]));
+ const int16x8x2_t r23_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+ vreinterpretq_s16_s32(r13_s32.val[1]));
+ const int16x8x2_t r45_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+ vreinterpretq_s16_s32(r57_s32.val[0]));
+ const int16x8x2_t r67_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+ vreinterpretq_s16_s32(r57_s32.val[1]));
+ input_0 = r01_s16.val[0];
+ input_1 = r01_s16.val[1];
+ input_2 = r23_s16.val[0];
+ input_3 = r23_s16.val[1];
+ input_4 = r45_s16.val[0];
+ input_5 = r45_s16.val[1];
+ input_6 = r67_s16.val[0];
+ input_7 = r67_s16.val[1];
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ } // for
+ {
+ // from aom_dct_sse2.c
+ // Post-condition (division by two)
+    // division of a 16-bit signed number by two using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
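+    // For example, n = -7: n >> 15 = -1 and (-7 - (-1)) >> 1 = -3, matching
+    // C's truncating -7 / 2. vhsubq_s16 below computes (a - b) >> 1 directly.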
+ const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
+ const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
+ const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
+ const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
+ const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
+ const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
+ const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
+ const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
+ input_0 = vhsubq_s16(input_0, sign_in0);
+ input_1 = vhsubq_s16(input_1, sign_in1);
+ input_2 = vhsubq_s16(input_2, sign_in2);
+ input_3 = vhsubq_s16(input_3, sign_in3);
+ input_4 = vhsubq_s16(input_4, sign_in4);
+ input_5 = vhsubq_s16(input_5, sign_in5);
+ input_6 = vhsubq_s16(input_6, sign_in6);
+ input_7 = vhsubq_s16(input_7, sign_in7);
+ // store results
+ vst1q_s16(&final_output[0 * 8], input_0);
+ vst1q_s16(&final_output[1 * 8], input_1);
+ vst1q_s16(&final_output[2 * 8], input_2);
+ vst1q_s16(&final_output[3 * 8], input_3);
+ vst1q_s16(&final_output[4 * 8], input_4);
+ vst1q_s16(&final_output[5 * 8], input_5);
+ vst1q_s16(&final_output[6 * 8], input_6);
+ vst1q_s16(&final_output[7 * 8], input_7);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/hadamard_neon.c b/third_party/aom/aom_dsp/arm/hadamard_neon.c
new file mode 100644
index 0000000000..d0f59227db
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/hadamard_neon.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
+static INLINE void hadamard_4x4_one_pass(int16x4_t *a0, int16x4_t *a1,
+ int16x4_t *a2, int16x4_t *a3) {
+ const int16x4_t b0 = vhadd_s16(*a0, *a1);
+ const int16x4_t b1 = vhsub_s16(*a0, *a1);
+ const int16x4_t b2 = vhadd_s16(*a2, *a3);
+ const int16x4_t b3 = vhsub_s16(*a2, *a3);
+
+ *a0 = vadd_s16(b0, b2);
+ *a1 = vadd_s16(b1, b3);
+ *a2 = vsub_s16(b0, b2);
+ *a3 = vsub_s16(b1, b3);
+}
+
+void aom_hadamard_4x4_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int16x4_t a0 = vld1_s16(src_diff);
+ int16x4_t a1 = vld1_s16(src_diff + src_stride);
+ int16x4_t a2 = vld1_s16(src_diff + 2 * src_stride);
+ int16x4_t a3 = vld1_s16(src_diff + 3 * src_stride);
+
+ hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);
+
+ transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);
+
+ hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);
+
+ store_s16_to_tran_low(coeff, a0);
+ store_s16_to_tran_low(coeff + 4, a1);
+ store_s16_to_tran_low(coeff + 8, a2);
+ store_s16_to_tran_low(coeff + 12, a3);
+}
+
+static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+ int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6, int16x8_t *a7) {
+ const int16x8_t b0 = vaddq_s16(*a0, *a1);
+ const int16x8_t b1 = vsubq_s16(*a0, *a1);
+ const int16x8_t b2 = vaddq_s16(*a2, *a3);
+ const int16x8_t b3 = vsubq_s16(*a2, *a3);
+ const int16x8_t b4 = vaddq_s16(*a4, *a5);
+ const int16x8_t b5 = vsubq_s16(*a4, *a5);
+ const int16x8_t b6 = vaddq_s16(*a6, *a7);
+ const int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+ const int16x8_t c0 = vaddq_s16(b0, b2);
+ const int16x8_t c1 = vaddq_s16(b1, b3);
+ const int16x8_t c2 = vsubq_s16(b0, b2);
+ const int16x8_t c3 = vsubq_s16(b1, b3);
+ const int16x8_t c4 = vaddq_s16(b4, b6);
+ const int16x8_t c5 = vaddq_s16(b5, b7);
+ const int16x8_t c6 = vsubq_s16(b4, b6);
+ const int16x8_t c7 = vsubq_s16(b5, b7);
+
+ *a0 = vaddq_s16(c0, c4);
+ *a1 = vsubq_s16(c2, c6);
+ *a2 = vsubq_s16(c0, c4);
+ *a3 = vaddq_s16(c2, c6);
+ *a4 = vaddq_s16(c3, c7);
+ *a5 = vsubq_s16(c3, c7);
+ *a6 = vsubq_s16(c1, c5);
+ *a7 = vaddq_s16(c1, c5);
+}
+
+void aom_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int16x8_t a0 = vld1q_s16(src_diff);
+ int16x8_t a1 = vld1q_s16(src_diff + src_stride);
+ int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
+ int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
+ int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
+ int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
+ int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
+ int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
+
+ hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ // Skip the second transpose because it is not required.
+
+ store_s16q_to_tran_low(coeff + 0, a0);
+ store_s16q_to_tran_low(coeff + 8, a1);
+ store_s16q_to_tran_low(coeff + 16, a2);
+ store_s16q_to_tran_low(coeff + 24, a3);
+ store_s16q_to_tran_low(coeff + 32, a4);
+ store_s16q_to_tran_low(coeff + 40, a5);
+ store_s16q_to_tran_low(coeff + 48, a6);
+ store_s16q_to_tran_low(coeff + 56, a7);
+}
+
+void aom_hadamard_lp_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ int16x8_t a0 = vld1q_s16(src_diff);
+ int16x8_t a1 = vld1q_s16(src_diff + src_stride);
+ int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
+ int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
+ int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
+ int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
+ int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
+ int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
+
+ hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ // Skip the second transpose because it is not required.
+
+ vst1q_s16(coeff + 0, a0);
+ vst1q_s16(coeff + 8, a1);
+ vst1q_s16(coeff + 16, a2);
+ vst1q_s16(coeff + 24, a3);
+ vst1q_s16(coeff + 32, a4);
+ vst1q_s16(coeff + 40, a5);
+ vst1q_s16(coeff + 48, a6);
+ vst1q_s16(coeff + 56, a7);
+}
+
+void aom_hadamard_lp_8x8_dual_neon(const int16_t *src_diff,
+ ptrdiff_t src_stride, int16_t *coeff) {
+ for (int i = 0; i < 2; i++) {
+ aom_hadamard_lp_8x8_neon(src_diff + (i * 8), src_stride, coeff + (i * 64));
+ }
+}
+
+void aom_hadamard_lp_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ /* Rearrange 16x16 to 8x32 and remove stride.
+ * Top left first. */
+ aom_hadamard_lp_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride,
+ coeff + 0);
+ /* Top right. */
+ aom_hadamard_lp_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride,
+ coeff + 64);
+ /* Bottom left. */
+ aom_hadamard_lp_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride,
+ coeff + 128);
+ /* Bottom right. */
+ aom_hadamard_lp_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride,
+ coeff + 192);
+
+ for (int i = 0; i < 64; i += 8) {
+ const int16x8_t a0 = vld1q_s16(coeff + 0);
+ const int16x8_t a1 = vld1q_s16(coeff + 64);
+ const int16x8_t a2 = vld1q_s16(coeff + 128);
+ const int16x8_t a3 = vld1q_s16(coeff + 192);
+
+ const int16x8_t b0 = vhaddq_s16(a0, a1);
+ const int16x8_t b1 = vhsubq_s16(a0, a1);
+ const int16x8_t b2 = vhaddq_s16(a2, a3);
+ const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+ const int16x8_t c0 = vaddq_s16(b0, b2);
+ const int16x8_t c1 = vaddq_s16(b1, b3);
+ const int16x8_t c2 = vsubq_s16(b0, b2);
+ const int16x8_t c3 = vsubq_s16(b1, b3);
+
+ vst1q_s16(coeff + 0, c0);
+ vst1q_s16(coeff + 64, c1);
+ vst1q_s16(coeff + 128, c2);
+ vst1q_s16(coeff + 192, c3);
+
+ coeff += 8;
+ }
+}
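A scalar model of the combine loop above, assuming arithmetic right shifts on int: the four 8x8 Hadamard outputs sit contiguously at coeff + 0, + 64, + 128 and + 192, and the first butterfly stage halves (vhaddq_s16/vhsubq_s16) so the result stays in the 16-bit range. The helper below is illustrative only, not part of the file:

    #include <stdint.h>

    // Combine the co-located coefficient k of the four 8x8 quadrant
    // transforms with a 4-point Hadamard butterfly, halving in the first
    // stage exactly as the halving adds/subtracts do above.
    static void combine_quadrants_scalar(int16_t *coeff, int k) {
      const int a0 = coeff[k + 0];    // top left
      const int a1 = coeff[k + 64];   // top right
      const int a2 = coeff[k + 128];  // bottom left
      const int a3 = coeff[k + 192];  // bottom right

      const int b0 = (a0 + a1) >> 1;  // vhaddq_s16
      const int b1 = (a0 - a1) >> 1;  // vhsubq_s16
      const int b2 = (a2 + a3) >> 1;
      const int b3 = (a2 - a3) >> 1;

      coeff[k + 0] = (int16_t)(b0 + b2);
      coeff[k + 64] = (int16_t)(b1 + b3);
      coeff[k + 128] = (int16_t)(b0 - b2);
      coeff[k + 192] = (int16_t)(b1 - b3);
    }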
+
+void aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ /* Rearrange 16x16 to 8x32 and remove stride.
+ * Top left first. */
+ aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+ /* Top right. */
+ aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
+ /* Bottom left. */
+ aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
+ /* Bottom right. */
+ aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+
+ // Each iteration of the loop operates on entire rows (16 samples each)
+ // because we need to swap the second and third quarters of every row in the
+ // output to match AVX2 output (i.e., aom_hadamard_16x16_avx2). See the for
+ // loop at the end of aom_hadamard_16x16_c.
+ for (int i = 0; i < 64; i += 16) {
+ const int32x4_t a00 = vld1q_s32(coeff + 0);
+ const int32x4_t a01 = vld1q_s32(coeff + 64);
+ const int32x4_t a02 = vld1q_s32(coeff + 128);
+ const int32x4_t a03 = vld1q_s32(coeff + 192);
+
+ const int32x4_t b00 = vhaddq_s32(a00, a01);
+ const int32x4_t b01 = vhsubq_s32(a00, a01);
+ const int32x4_t b02 = vhaddq_s32(a02, a03);
+ const int32x4_t b03 = vhsubq_s32(a02, a03);
+
+ const int32x4_t c00 = vaddq_s32(b00, b02);
+ const int32x4_t c01 = vaddq_s32(b01, b03);
+ const int32x4_t c02 = vsubq_s32(b00, b02);
+ const int32x4_t c03 = vsubq_s32(b01, b03);
+
+ const int32x4_t a10 = vld1q_s32(coeff + 4 + 0);
+ const int32x4_t a11 = vld1q_s32(coeff + 4 + 64);
+ const int32x4_t a12 = vld1q_s32(coeff + 4 + 128);
+ const int32x4_t a13 = vld1q_s32(coeff + 4 + 192);
+
+ const int32x4_t b10 = vhaddq_s32(a10, a11);
+ const int32x4_t b11 = vhsubq_s32(a10, a11);
+ const int32x4_t b12 = vhaddq_s32(a12, a13);
+ const int32x4_t b13 = vhsubq_s32(a12, a13);
+
+ const int32x4_t c10 = vaddq_s32(b10, b12);
+ const int32x4_t c11 = vaddq_s32(b11, b13);
+ const int32x4_t c12 = vsubq_s32(b10, b12);
+ const int32x4_t c13 = vsubq_s32(b11, b13);
+
+ const int32x4_t a20 = vld1q_s32(coeff + 8 + 0);
+ const int32x4_t a21 = vld1q_s32(coeff + 8 + 64);
+ const int32x4_t a22 = vld1q_s32(coeff + 8 + 128);
+ const int32x4_t a23 = vld1q_s32(coeff + 8 + 192);
+
+ const int32x4_t b20 = vhaddq_s32(a20, a21);
+ const int32x4_t b21 = vhsubq_s32(a20, a21);
+ const int32x4_t b22 = vhaddq_s32(a22, a23);
+ const int32x4_t b23 = vhsubq_s32(a22, a23);
+
+ const int32x4_t c20 = vaddq_s32(b20, b22);
+ const int32x4_t c21 = vaddq_s32(b21, b23);
+ const int32x4_t c22 = vsubq_s32(b20, b22);
+ const int32x4_t c23 = vsubq_s32(b21, b23);
+
+ const int32x4_t a30 = vld1q_s32(coeff + 12 + 0);
+ const int32x4_t a31 = vld1q_s32(coeff + 12 + 64);
+ const int32x4_t a32 = vld1q_s32(coeff + 12 + 128);
+ const int32x4_t a33 = vld1q_s32(coeff + 12 + 192);
+
+ const int32x4_t b30 = vhaddq_s32(a30, a31);
+ const int32x4_t b31 = vhsubq_s32(a30, a31);
+ const int32x4_t b32 = vhaddq_s32(a32, a33);
+ const int32x4_t b33 = vhsubq_s32(a32, a33);
+
+ const int32x4_t c30 = vaddq_s32(b30, b32);
+ const int32x4_t c31 = vaddq_s32(b31, b33);
+ const int32x4_t c32 = vsubq_s32(b30, b32);
+ const int32x4_t c33 = vsubq_s32(b31, b33);
+
+ vst1q_s32(coeff + 0 + 0, c00);
+ vst1q_s32(coeff + 0 + 4, c20);
+ vst1q_s32(coeff + 0 + 8, c10);
+ vst1q_s32(coeff + 0 + 12, c30);
+
+ vst1q_s32(coeff + 64 + 0, c01);
+ vst1q_s32(coeff + 64 + 4, c21);
+ vst1q_s32(coeff + 64 + 8, c11);
+ vst1q_s32(coeff + 64 + 12, c31);
+
+ vst1q_s32(coeff + 128 + 0, c02);
+ vst1q_s32(coeff + 128 + 4, c22);
+ vst1q_s32(coeff + 128 + 8, c12);
+ vst1q_s32(coeff + 128 + 12, c32);
+
+ vst1q_s32(coeff + 192 + 0, c03);
+ vst1q_s32(coeff + 192 + 4, c23);
+ vst1q_s32(coeff + 192 + 8, c13);
+ vst1q_s32(coeff + 192 + 12, c33);
+
+ coeff += 16;
+ }
+}
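The stores above interleave the quarters of each 16-wide output row so the NEON result matches the AVX2 ordering mentioned in the comment: relative to writing each quarter back at its own offset, the second and third quarters are exchanged. A sketch of that store order, assuming 32-bit coefficients and that q[0..3] hold the four combined quarters of one row (names illustrative only, not part of the file):

    #include <stdint.h>
    #include <string.h>

    // Write one 16-coefficient output row with the middle quarters swapped,
    // mirroring the vst1q_s32 offsets 0, 4, 8, 12 used above.
    static void store_row_avx2_order(int32_t *out, const int32_t q[4][4]) {
      memcpy(out + 0, q[0], 4 * sizeof(*out));   // input columns 0..3
      memcpy(out + 4, q[2], 4 * sizeof(*out));   // input columns 8..11
      memcpy(out + 8, q[1], 4 * sizeof(*out));   // input columns 4..7
      memcpy(out + 12, q[3], 4 * sizeof(*out));  // input columns 12..15
    }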
+
+void aom_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ /* Top left first. */
+ aom_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+ /* Top right. */
+ aom_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride,
+ coeff + 256);
+ /* Bottom left. */
+ aom_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride,
+ coeff + 512);
+ /* Bottom right. */
+ aom_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride,
+ coeff + 768);
+
+ for (int i = 0; i < 256; i += 4) {
+ const int32x4_t a0 = vld1q_s32(coeff);
+ const int32x4_t a1 = vld1q_s32(coeff + 256);
+ const int32x4_t a2 = vld1q_s32(coeff + 512);
+ const int32x4_t a3 = vld1q_s32(coeff + 768);
+
+ const int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2);
+ const int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2);
+ const int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2);
+ const int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2);
+
+ const int32x4_t c0 = vaddq_s32(b0, b2);
+ const int32x4_t c1 = vaddq_s32(b1, b3);
+ const int32x4_t c2 = vsubq_s32(b0, b2);
+ const int32x4_t c3 = vsubq_s32(b1, b3);
+
+ vst1q_s32(coeff + 0, c0);
+ vst1q_s32(coeff + 256, c1);
+ vst1q_s32(coeff + 512, c2);
+ vst1q_s32(coeff + 768, c3);
+
+ coeff += 4;
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_avg_neon.c b/third_party/aom/aom_dsp/arm/highbd_avg_neon.c
new file mode 100644
index 0000000000..47d5dae012
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_avg_neon.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+
+uint32_t aom_highbd_avg_4x4_neon(const uint8_t *a, int a_stride) {
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a);
+ uint16x4_t sum, a0, a1, a2, a3;
+
+ load_u16_4x4(a_ptr, a_stride, &a0, &a1, &a2, &a3);
+
+ sum = vadd_u16(a0, a1);
+ sum = vadd_u16(sum, a2);
+ sum = vadd_u16(sum, a3);
+
+ return (horizontal_add_u16x4(sum) + (1 << 3)) >> 4;
+}
+
+uint32_t aom_highbd_avg_8x8_neon(const uint8_t *a, int a_stride) {
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a);
+ uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7;
+
+ load_u16_8x8(a_ptr, a_stride, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ sum = vaddq_u16(a0, a1);
+ sum = vaddq_u16(sum, a2);
+ sum = vaddq_u16(sum, a3);
+ sum = vaddq_u16(sum, a4);
+ sum = vaddq_u16(sum, a5);
+ sum = vaddq_u16(sum, a6);
+ sum = vaddq_u16(sum, a7);
+
+ return (horizontal_add_u16x8(sum) + (1 << 5)) >> 6;
+}
+
+void aom_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8,
+ int dp, int *min, int *max) {
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8);
+
+ const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p);
+ const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p);
+ const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p);
+ const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p);
+ const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p);
+ const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p);
+ const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p);
+ const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p);
+
+ const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp);
+ const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp);
+ const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp);
+ const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp);
+ const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp);
+ const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp);
+ const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp);
+ const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp);
+
+ const uint16x8_t abs_diff0 = vabdq_u16(a0, b0);
+ const uint16x8_t abs_diff1 = vabdq_u16(a1, b1);
+ const uint16x8_t abs_diff2 = vabdq_u16(a2, b2);
+ const uint16x8_t abs_diff3 = vabdq_u16(a3, b3);
+ const uint16x8_t abs_diff4 = vabdq_u16(a4, b4);
+ const uint16x8_t abs_diff5 = vabdq_u16(a5, b5);
+ const uint16x8_t abs_diff6 = vabdq_u16(a6, b6);
+ const uint16x8_t abs_diff7 = vabdq_u16(a7, b7);
+
+ const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1);
+ const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3);
+ const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5);
+ const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7);
+
+ const uint16x8_t max0123 = vmaxq_u16(max01, max23);
+ const uint16x8_t max4567 = vmaxq_u16(max45, max67);
+ const uint16x8_t max07 = vmaxq_u16(max0123, max4567);
+
+ const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1);
+ const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3);
+ const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5);
+ const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7);
+
+ const uint16x8_t min0123 = vminq_u16(min01, min23);
+ const uint16x8_t min4567 = vminq_u16(min45, min67);
+ const uint16x8_t min07 = vminq_u16(min0123, min4567);
+
+#if AOM_ARCH_AARCH64
+ *max = (int)vmaxvq_u16(max07);
+ *min = (int)vminvq_u16(min07);
+#else
+ // Split into 64-bit vectors and execute pairwise min/max.
+ uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07));
+ uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07));
+
+  // Repeated vpmax/vpmin folds propagate the max/min value to every lane.
+ ab_max = vpmax_u16(ab_max, ab_max);
+ ab_min = vpmin_u16(ab_min, ab_min);
+
+ ab_max = vpmax_u16(ab_max, ab_max);
+ ab_min = vpmin_u16(ab_min, ab_min);
+
+ ab_max = vpmax_u16(ab_max, ab_max);
+ ab_min = vpmin_u16(ab_min, ab_min);
+
+ *min = *max = 0; // Clear high bits
+ // Store directly to avoid costly neon->gpr transfer.
+ vst1_lane_u16((uint16_t *)max, ab_max, 0);
+ vst1_lane_u16((uint16_t *)min, ab_min, 0);
+#endif
+}
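On AArch32 there are no across-vector vmaxvq_u16/vminvq_u16 reductions, which is why the fallback above folds the vectors pairwise. A minimal sketch of the idea for a single uint16x4_t (helper name illustrative only, not part of the file):

    #include <arm_neon.h>

    static inline uint16_t horizontal_max_u16x4_sketch(uint16x4_t v) {
      // First fold: each lane holds the max of one input pair.
      v = vpmax_u16(v, v);
      // Second fold: every lane now holds the max of all four inputs.
      v = vpmax_u16(v, v);
      return vget_lane_u16(v, 0);
    }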
diff --git a/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c b/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c
new file mode 100644
index 0000000000..531309b025
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_comp_avg_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride) {
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ int i = height;
+ if (width > 8) {
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t p = vld1q_u16(pred + j);
+ const uint16x8_t r = vld1q_u16(ref + j);
+
+ uint16x8_t avg = vrhaddq_u16(p, r);
+ vst1q_u16(comp_pred + j, avg);
+
+ j += 8;
+ } while (j < width);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ } else if (width == 8) {
+ do {
+ const uint16x8_t p = vld1q_u16(pred);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t avg = vrhaddq_u16(p, r);
+ vst1q_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ } else {
+ assert(width == 4);
+ do {
+ const uint16x4_t p = vld1_u16(pred);
+ const uint16x4_t r = vld1_u16(ref);
+
+ uint16x4_t avg = vrhadd_u16(p, r);
+ vst1_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ }
+}
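vrhaddq_u16/vrhadd_u16 above are rounding halving adds, so each averaged sample reduces to the following scalar (helper illustrative only, not part of the file):

    #include <stdint.h>

    static uint16_t comp_avg_scalar(uint16_t pred, uint16_t ref) {
      return (uint16_t)((pred + ref + 1) >> 1);  // matches vrhaddq_u16
    }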
+
+void aom_highbd_comp_mask_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ const uint16_t *src0 = invert_mask ? pred : ref;
+ const uint16_t *src1 = invert_mask ? ref : pred;
+ const int src_stride0 = invert_mask ? width : ref_stride;
+ const int src_stride1 = invert_mask ? ref_stride : width;
+
+ if (width >= 8) {
+ do {
+ int j = 0;
+
+ do {
+ const uint16x8_t s0 = vld1q_u16(src0 + j);
+ const uint16x8_t s1 = vld1q_u16(src1 + j);
+ const uint16x8_t m0 = vmovl_u8(vld1_u8(mask + j));
+
+ uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ vst1q_u16(comp_pred + j, blend_u16);
+
+ j += 8;
+ } while (j < width);
+
+ src0 += src_stride0;
+ src1 += src_stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ } while (--height != 0);
+ } else {
+ assert(width == 4);
+
+ do {
+ const uint16x4_t s0 = vld1_u16(src0);
+ const uint16x4_t s1 = vld1_u16(src1);
+ const uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(mask)));
+
+ uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, s0, s1);
+
+ vst1_u16(comp_pred, blend_u16);
+
+ src0 += src_stride0;
+ src1 += src_stride1;
+ mask += mask_stride;
+ comp_pred += 4;
+ } while (--height != 0);
+ }
+}
+
+void aom_highbd_dist_wtd_comp_avg_pred_neon(
+ uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+ const uint8_t *ref8, int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint16x8_t fwd_offset_u16 = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset_u16 = vdupq_n_u16(jcp_param->bck_offset);
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ if (width > 8) {
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t p = vld1q_u16(pred + j);
+ const uint16x8_t r = vld1q_u16(ref + j);
+
+ const uint16x8_t avg =
+ dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16);
+
+ vst1q_u16(comp_pred + j, avg);
+
+ j += 8;
+ } while (j < width);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--height != 0);
+ } else if (width == 8) {
+ do {
+ const uint16x8_t p = vld1q_u16(pred);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ const uint16x8_t avg =
+ dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16);
+
+ vst1q_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--height != 0);
+ } else {
+ assert(width == 4);
+ do {
+ const uint16x4_t p = vld1_u16(pred);
+ const uint16x4_t r = vld1_u16(ref);
+
+ const uint16x4_t avg = dist_wtd_avg_u16x4(
+ r, p, vget_low_u16(fwd_offset_u16), vget_low_u16(bck_offset_u16));
+
+ vst1_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--height != 0);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c b/third_party/aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
new file mode 100644
index 0000000000..8b03e91ac3
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_blend_a64_hmask_neon(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8,
+ uint32_t src0_stride,
+ const uint8_t *src1_8,
+ uint32_t src1_stride, const uint8_t *mask,
+ int w, int h, int bd) {
+ (void)bd;
+
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 4) {
+ const uint16x8_t m0 = vmovl_u8(load_unaligned_dup_u8_4x2(mask));
+ do {
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ store_u16x4_strided_x2(dst, dst_stride, blend);
+
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 2 && h >= 8) {
+ const uint16x4_t m0 =
+ vget_low_u16(vmovl_u8(load_unaligned_dup_u8_2x4(mask)));
+ do {
+ uint16x4_t s0 = load_unaligned_u16_2x2(src0, src0_stride);
+ uint16x4_t s1 = load_unaligned_u16_2x2(src1, src1_stride);
+
+ uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1);
+
+ store_u16x2_strided_x2(dst, dst_stride, blend);
+
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ aom_highbd_blend_a64_hmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, w, h, bd);
+ }
+}
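alpha_blend_a64_u16x8/alpha_blend_a64_u16x4 are provided by aom_dsp/arm/blend_neon.h, included above. Assuming the usual A64 blend constants seen elsewhere in this patch (AOM_BLEND_A64_MAX_ALPHA = 64 and a 6-bit rounding shift), one blended sample reduces to this scalar sketch (helper name illustrative only, not part of the file):

    #include <stdint.h>

    // m lies in [0, 64]; the output is the 6-bit rounded weighted average of
    // the two source samples.
    static uint16_t blend_a64_scalar(uint16_t m, uint16_t s0, uint16_t s1) {
      return (uint16_t)((m * s0 + (64 - m) * s1 + 32) >> 6);
    }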
diff --git a/third_party/aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c
new file mode 100644
index 0000000000..90b44fcc5e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+#define HBD_BLEND_A64_D16_MASK(bd, round0_bits) \
+ static INLINE uint16x8_t alpha_##bd##_blend_a64_d16_u16x8( \
+ uint16x8_t m, uint16x8_t a, uint16x8_t b, int32x4_t round_offset) { \
+ const uint16x8_t m_inv = \
+ vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); \
+ \
+ uint32x4_t blend_u32_lo = vmlal_u16(vreinterpretq_u32_s32(round_offset), \
+ vget_low_u16(m), vget_low_u16(a)); \
+ uint32x4_t blend_u32_hi = vmlal_u16(vreinterpretq_u32_s32(round_offset), \
+ vget_high_u16(m), vget_high_u16(a)); \
+ \
+ blend_u32_lo = \
+ vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b)); \
+ blend_u32_hi = \
+ vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b)); \
+ \
+ uint16x4_t blend_u16_lo = \
+ vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_lo), \
+ AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS - \
+ round0_bits - COMPOUND_ROUND1_BITS); \
+ uint16x4_t blend_u16_hi = \
+ vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_hi), \
+ AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS - \
+ round0_bits - COMPOUND_ROUND1_BITS); \
+ \
+ uint16x8_t blend_u16 = vcombine_u16(blend_u16_lo, blend_u16_hi); \
+ blend_u16 = vminq_u16(blend_u16, vdupq_n_u16((1 << bd) - 1)); \
+ \
+ return blend_u16; \
+ } \
+ \
+ static INLINE void highbd_##bd##_blend_a64_d16_mask_neon( \
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, \
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, \
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, \
+ int subh) { \
+ const int offset_bits = bd + 2 * FILTER_BITS - round0_bits; \
+ int32_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + \
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); \
+ int32x4_t offset = \
+ vdupq_n_s32(-(round_offset << AOM_BLEND_A64_ROUND_BITS)); \
+ \
+ if ((subw | subh) == 0) { \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i)); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { \
+ do { \
+ uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride)); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \
+ \
+ store_u16x4_strided_x2(dst, dst_stride, blend); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } else if ((subw & subh) == 1) { \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i); \
+ uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4( \
+ vget_low_u8(m0), vget_low_u8(m1), vget_high_u8(m0), \
+ vget_high_u8(m1))); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); \
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); \
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride); \
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t m_avg = \
+ vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ store_u16x4_strided_x2(dst, dst_stride, blend); \
+ \
+ mask += 4 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } else if (subw == 1 && subh == 0) { \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 2 * i); \
+ uint8x8_t m1 = vld1_u8(mask + 2 * i + 8); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); \
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ store_u16x4_strided_x2(dst, dst_stride, blend); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } else { \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i); \
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { \
+ do { \
+ uint8x8_t m0_2 = \
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride); \
+ uint8x8_t m1_3 = \
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ store_u16x4_strided_x2(dst, dst_stride, blend); \
+ \
+ mask += 4 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } \
+ }
+
+// 12 bitdepth
+HBD_BLEND_A64_D16_MASK(12, (ROUND0_BITS + 2))
+// 10 bitdepth
+HBD_BLEND_A64_D16_MASK(10, ROUND0_BITS)
+// 8 bitdepth
+HBD_BLEND_A64_D16_MASK(8, ROUND0_BITS)
+
+void aom_highbd_blend_a64_d16_mask_neon(
+ uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params, const int bd) {
+ (void)conv_params;
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ if (bd == 12) {
+ highbd_12_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ subw, subh);
+ } else if (bd == 10) {
+ highbd_10_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ subw, subh);
+ } else {
+ highbd_8_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h, subw,
+ subh);
+ }
+}
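The macro above folds the removal of the compound-prediction offset and both rounding shifts into a single saturating narrowing shift (vqrshrun_n_s32) followed by a clamp to the bit-depth maximum. A rough scalar model of one output sample that mirrors the vector arithmetic, assuming the constants from aom_dsp/blend.h and aom_dsp/aom_filter.h (helper illustrative only, not part of the file):

    static uint16_t blend_a64_d16_scalar(uint16_t m, uint16_t a, uint16_t b,
                                         int bd, int round0_bits) {
      const int offset_bits = bd + 2 * FILTER_BITS - round0_bits;
      const int32_t round_offset =
          (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
          (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
      const int shift = AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS -
                        round0_bits - COMPOUND_ROUND1_BITS;
      // Weighted sum minus the pre-shifted offset, then one rounding shift
      // with saturation at zero, as vqrshrun_n_s32 does.
      int64_t acc = (int64_t)m * a +
                    (int64_t)(AOM_BLEND_A64_MAX_ALPHA - m) * b -
                    ((int64_t)round_offset << AOM_BLEND_A64_ROUND_BITS);
      int32_t res = (int32_t)((acc + (1 << (shift - 1))) >> shift);
      if (res < 0) res = 0;
      const int32_t max = (1 << bd) - 1;  // final vminq_u16 clamp
      return (uint16_t)(res > max ? max : res);
    }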
+
+void aom_highbd_blend_a64_mask_neon(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int w, int h, int subw, int subh, int bd) {
+ (void)bd;
+
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if ((subw | subh) == 0) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride));
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ store_u16x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if ((subw & subh) == 1) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i);
+ uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8);
+ uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg =
+ vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ store_u16x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (subw == 1 && subh == 0) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ store_u16x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0_2 =
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+ uint8x8_t m1_3 =
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ store_u16x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c b/third_party/aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
new file mode 100644
index 0000000000..1292e20342
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_blend_a64_vmask_neon(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8,
+ uint32_t src0_stride,
+ const uint8_t *src1_8,
+ uint32_t src1_stride, const uint8_t *mask,
+ int w, int h, int bd) {
+ (void)bd;
+
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (w >= 8) {
+ do {
+ uint16x8_t m = vmovl_u8(vdup_n_u8(mask[0]));
+ int i = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += 1;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 4) {
+ do {
+ uint16x4_t m1 = vdup_n_u16((uint16_t)mask[0]);
+ uint16x4_t m2 = vdup_n_u16((uint16_t)mask[1]);
+ uint16x8_t m = vcombine_u16(m1, m2);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1);
+
+ store_u16x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 2 && h >= 8) {
+ do {
+ uint16x4_t m0 = vdup_n_u16(0);
+ m0 = vld1_lane_u16((uint16_t *)mask, m0, 0);
+ uint8x8_t m0_zip =
+ vzip_u8(vreinterpret_u8_u16(m0), vreinterpret_u8_u16(m0)).val[0];
+ m0 = vget_low_u16(vmovl_u8(m0_zip));
+ uint16x4_t s0 = load_unaligned_u16_2x2(src0, src0_stride);
+ uint16x4_t s1 = load_unaligned_u16_2x2(src1, src1_stride);
+
+ uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1);
+
+ store_u16x2_strided_x2(dst, dst_stride, blend);
+
+ mask += 2;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, w, h, bd);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_convolve8_neon.c b/third_party/aom/aom_dsp/arm/highbd_convolve8_neon.c
new file mode 100644
index 0000000000..e25438c9b4
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_convolve8_neon.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+static INLINE int32x4_t highbd_convolve8_4_s32(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+ int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
+
+ return sum;
+}
+
+static INLINE uint16x4_t highbd_convolve8_4_s32_s16(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+ int32x4_t sum =
+ highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+ return vqrshrun_n_s32(sum, FILTER_BITS);
+}
+
+static INLINE int32x4_t highbd_convolve8_horiz4_s32(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+ const int16x8_t s2 = vextq_s16(s0, s1, 1);
+ const int16x8_t s3 = vextq_s16(s0, s1, 2);
+ const int16x8_t s4 = vextq_s16(s0, s1, 3);
+ const int16x4_t s0_lo = vget_low_s16(s0);
+ const int16x4_t s1_lo = vget_low_s16(s2);
+ const int16x4_t s2_lo = vget_low_s16(s3);
+ const int16x4_t s3_lo = vget_low_s16(s4);
+ const int16x4_t s4_lo = vget_high_s16(s0);
+ const int16x4_t s5_lo = vget_high_s16(s2);
+ const int16x4_t s6_lo = vget_high_s16(s3);
+ const int16x4_t s7_lo = vget_high_s16(s4);
+
+ return highbd_convolve8_4_s32(s0_lo, s1_lo, s2_lo, s3_lo, s4_lo, s5_lo, s6_lo,
+ s7_lo, x_filter_0_7);
+}
+
+static INLINE uint16x4_t highbd_convolve8_horiz4_s32_s16(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+ int32x4_t sum = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7);
+
+ return vqrshrun_n_s32(sum, FILTER_BITS);
+}
+
+static INLINE void highbd_convolve8_8_s32(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+ int32x4_t *sum0, int32x4_t *sum1) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+ *sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 1);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 2);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_lo, 3);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 0);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 1);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_hi, 2);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_hi, 3);
+
+ *sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 1);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_lo, 2);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_lo, 3);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 0);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 1);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_hi, 2);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3);
+}
+
+static INLINE void highbd_convolve8_horiz8_s32(const int16x8_t s0,
+ const int16x8_t s0_hi,
+ const int16x8_t x_filter_0_7,
+ int32x4_t *sum0,
+ int32x4_t *sum1) {
+ const int16x8_t s1 = vextq_s16(s0, s0_hi, 1);
+ const int16x8_t s2 = vextq_s16(s0, s0_hi, 2);
+ const int16x8_t s3 = vextq_s16(s0, s0_hi, 3);
+ const int16x8_t s4 = vextq_s16(s0, s0_hi, 4);
+ const int16x8_t s5 = vextq_s16(s0, s0_hi, 5);
+ const int16x8_t s6 = vextq_s16(s0, s0_hi, 6);
+ const int16x8_t s7 = vextq_s16(s0, s0_hi, 7);
+
+ highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_0_7, sum0,
+ sum1);
+}
+
+static INLINE uint16x8_t highbd_convolve8_horiz8_s32_s16(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+ int32x4_t sum0, sum1;
+ highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, &sum0, &sum1);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_s32_s16(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) {
+ int32x4_t sum0;
+ int32x4_t sum1;
+ highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, &sum0,
+ &sum1);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
+static void highbd_convolve_horiz_neon(const uint16_t *src_ptr,
+ ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride,
+ const int16_t *x_filter_ptr,
+ int x_step_q4, int w, int h, int bd) {
+ assert(w >= 4 && h >= 4);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0, s1, s2, s3;
+ load_s16_8x2(s, src_stride, &s0, &s2);
+ load_s16_8x2(s + 8, src_stride, &s1, &s3);
+
+ uint16x4_t d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter);
+ uint16x4_t d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter);
+
+ uint16x8_t d01 = vcombine_u16(d0, d1);
+ d01 = vminq_u16(d01, max);
+
+ vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
+ vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
+
+ s += 2 * src_stride;
+ d += 2 * dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else {
+ int height = h;
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ int x_q4 = 0;
+
+ const int16_t *src_x = &s[x_q4 >> SUBPEL_BITS];
+ int16x8_t s0, s2, s4, s6;
+ load_s16_8x4(src_x, src_stride, &s0, &s2, &s4, &s6);
+ src_x += 8;
+
+ do {
+ int16x8_t s1, s3, s5, s7;
+ load_s16_8x4(src_x, src_stride, &s1, &s3, &s5, &s7);
+
+ uint16x8_t d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter);
+ uint16x8_t d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter);
+ uint16x8_t d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter);
+ uint16x8_t d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter);
+
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s1;
+ s2 = s3;
+ s4 = s5;
+ s6 = s7;
+ src_x += 8;
+ d += 8;
+ width -= 8;
+ x_q4 += 8 * x_step_q4;
+ } while (width > 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ }
+}
+
+void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ if (x_step_q4 != 16) {
+ aom_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h, bd);
+ } else {
+ (void)filter_y;
+ (void)y_step_q4;
+
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ src -= SUBPEL_TAPS / 2 - 1;
+ highbd_convolve_horiz_neon(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, w, h, bd);
+ }
+}
+
+static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
+ ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride,
+ const int16_t *y_filter_ptr, int w, int h,
+ int bd) {
+ assert(w >= 4 && h >= 4);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 =
+ highbd_convolve8_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ uint16x4_t d1 =
+ highbd_convolve8_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ uint16x4_t d2 =
+ highbd_convolve8_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ uint16x4_t d3 =
+ highbd_convolve8_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+ uint16x8_t d01 = vcombine_u16(d0, d1);
+ uint16x8_t d23 = vcombine_u16(d2, d3);
+
+ d01 = vminq_u16(d01, max);
+ d23 = vminq_u16(d23, max);
+
+ vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
+ vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
+ vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
+ vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6,
+ s7, y_filter);
+ uint16x8_t d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7,
+ s8, y_filter);
+ uint16x8_t d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8,
+ s9, y_filter);
+ uint16x8_t d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9,
+ s10, y_filter);
+
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
+
+void aom_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ if (y_step_q4 != 16) {
+ aom_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h, bd);
+ } else {
+ (void)filter_x;
+ (void)x_step_q4;
+
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
+ highbd_convolve_vert_neon(src, src_stride, dst, dst_stride, filter_y, w, h,
+ bd);
+ }
+}
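Each of the 8-tap paths above accumulates in 32 bits, applies vqrshrun_n_s32(sum, FILTER_BITS) and clamps to the bit-depth maximum, so one output sample reduces to the following scalar sketch (assuming FILTER_BITS from aom_dsp/aom_filter.h; helper name illustrative only, not part of the file):

    #include <stdint.h>

    static uint16_t convolve8_scalar(const int16_t *s, const int16_t *filter,
                                     int bd) {
      int32_t sum = 0;
      for (int k = 0; k < 8; ++k) sum += s[k] * filter[k];
      // Rounding shift with saturation at zero, then clamp to (1 << bd) - 1.
      int32_t res = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
      if (res < 0) res = 0;
      const int32_t max = (1 << bd) - 1;
      return (uint16_t)(res > max ? max : res);
    }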
diff --git a/third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c b/third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c
new file mode 100644
index 0000000000..d28617c67e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+
+static INLINE void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1,
+ int16x8_t *a2, int16x8_t *a3,
+ int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6,
+ int16x8_t *a7) {
+ int16x8_t b0 = vaddq_s16(*a0, *a1);
+ int16x8_t b1 = vsubq_s16(*a0, *a1);
+ int16x8_t b2 = vaddq_s16(*a2, *a3);
+ int16x8_t b3 = vsubq_s16(*a2, *a3);
+ int16x8_t b4 = vaddq_s16(*a4, *a5);
+ int16x8_t b5 = vsubq_s16(*a4, *a5);
+ int16x8_t b6 = vaddq_s16(*a6, *a7);
+ int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+ int16x8_t c0 = vaddq_s16(b0, b2);
+ int16x8_t c2 = vsubq_s16(b0, b2);
+ int16x8_t c1 = vaddq_s16(b1, b3);
+ int16x8_t c3 = vsubq_s16(b1, b3);
+ int16x8_t c4 = vaddq_s16(b4, b6);
+ int16x8_t c6 = vsubq_s16(b4, b6);
+ int16x8_t c5 = vaddq_s16(b5, b7);
+ int16x8_t c7 = vsubq_s16(b5, b7);
+
+ *a0 = vaddq_s16(c0, c4);
+ *a2 = vsubq_s16(c0, c4);
+ *a7 = vaddq_s16(c1, c5);
+ *a6 = vsubq_s16(c1, c5);
+ *a3 = vaddq_s16(c2, c6);
+ *a1 = vsubq_s16(c2, c6);
+ *a4 = vaddq_s16(c3, c7);
+ *a5 = vsubq_s16(c3, c7);
+}
+
+static INLINE void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1,
+ int16x4_t a2, int16x4_t a3,
+ int16x4_t a4, int16x4_t a5,
+ int16x4_t a6, int16x4_t a7,
+ tran_low_t *coeff) {
+ int32x4_t b0 = vaddl_s16(a0, a1);
+ int32x4_t b1 = vsubl_s16(a0, a1);
+ int32x4_t b2 = vaddl_s16(a2, a3);
+ int32x4_t b3 = vsubl_s16(a2, a3);
+ int32x4_t b4 = vaddl_s16(a4, a5);
+ int32x4_t b5 = vsubl_s16(a4, a5);
+ int32x4_t b6 = vaddl_s16(a6, a7);
+ int32x4_t b7 = vsubl_s16(a6, a7);
+
+ int32x4_t c0 = vaddq_s32(b0, b2);
+ int32x4_t c2 = vsubq_s32(b0, b2);
+ int32x4_t c1 = vaddq_s32(b1, b3);
+ int32x4_t c3 = vsubq_s32(b1, b3);
+ int32x4_t c4 = vaddq_s32(b4, b6);
+ int32x4_t c6 = vsubq_s32(b4, b6);
+ int32x4_t c5 = vaddq_s32(b5, b7);
+ int32x4_t c7 = vsubq_s32(b5, b7);
+
+ int32x4_t d0 = vaddq_s32(c0, c4);
+ int32x4_t d2 = vsubq_s32(c0, c4);
+ int32x4_t d7 = vaddq_s32(c1, c5);
+ int32x4_t d6 = vsubq_s32(c1, c5);
+ int32x4_t d3 = vaddq_s32(c2, c6);
+ int32x4_t d1 = vsubq_s32(c2, c6);
+ int32x4_t d4 = vaddq_s32(c3, c7);
+ int32x4_t d5 = vsubq_s32(c3, c7);
+
+ vst1q_s32(coeff + 0, d0);
+ vst1q_s32(coeff + 4, d1);
+ vst1q_s32(coeff + 8, d2);
+ vst1q_s32(coeff + 12, d3);
+ vst1q_s32(coeff + 16, d4);
+ vst1q_s32(coeff + 20, d5);
+ vst1q_s32(coeff + 24, d6);
+ vst1q_s32(coeff + 28, d7);
+}
+
+void aom_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int16x4_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+ int16x8_t s0 = vld1q_s16(src_diff + 0 * src_stride);
+ int16x8_t s1 = vld1q_s16(src_diff + 1 * src_stride);
+ int16x8_t s2 = vld1q_s16(src_diff + 2 * src_stride);
+ int16x8_t s3 = vld1q_s16(src_diff + 3 * src_stride);
+ int16x8_t s4 = vld1q_s16(src_diff + 4 * src_stride);
+ int16x8_t s5 = vld1q_s16(src_diff + 5 * src_stride);
+ int16x8_t s6 = vld1q_s16(src_diff + 6 * src_stride);
+ int16x8_t s7 = vld1q_s16(src_diff + 7 * src_stride);
+
+ // For the first pass we can stay in 16-bit elements (4095*8 = 32760).
+ hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ // For the second pass we need to widen to 32-bit elements, so we're
+ // processing 4 columns at a time.
+ // Skip the second transpose because it is not required.
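+  // The second-pass outputs can reach 8 * 32760 = 262080 in magnitude, which
+  // no longer fits in int16_t, hence the widening adds in
+  // hadamard_highbd_col4_second_pass.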
+
+ b0 = vget_low_s16(s0);
+ b1 = vget_low_s16(s1);
+ b2 = vget_low_s16(s2);
+ b3 = vget_low_s16(s3);
+ b4 = vget_low_s16(s4);
+ b5 = vget_low_s16(s5);
+ b6 = vget_low_s16(s6);
+ b7 = vget_low_s16(s7);
+
+ hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff);
+
+ b0 = vget_high_s16(s0);
+ b1 = vget_high_s16(s1);
+ b2 = vget_high_s16(s2);
+ b3 = vget_high_s16(s3);
+ b4 = vget_high_s16(s4);
+ b5 = vget_high_s16(s5);
+ b6 = vget_high_s16(s6);
+ b7 = vget_high_s16(s7);
+
+ hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff + 32);
+}
+
+void aom_highbd_hadamard_16x16_neon(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ // Rearrange 16x16 to 8x32 and remove stride.
+ // Top left first.
+ aom_highbd_hadamard_8x8_neon(src_diff, src_stride, coeff);
+ // Top right.
+ aom_highbd_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64);
+ // Bottom left.
+ aom_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride,
+ coeff + 128);
+ // Bottom right.
+ aom_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride + 8, src_stride,
+ coeff + 192);
+
+ for (int i = 0; i < 16; i++) {
+ int32x4_t a0 = vld1q_s32(coeff + 4 * i);
+ int32x4_t a1 = vld1q_s32(coeff + 4 * i + 64);
+ int32x4_t a2 = vld1q_s32(coeff + 4 * i + 128);
+ int32x4_t a3 = vld1q_s32(coeff + 4 * i + 192);
+
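+    // vhaddq_s32/vhsubq_s32 compute (a +/- b) >> 1 in a single instruction,
+    // combining the four 8x8 results with the /2 normalization in one step.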
+ int32x4_t b0 = vhaddq_s32(a0, a1);
+ int32x4_t b1 = vhsubq_s32(a0, a1);
+ int32x4_t b2 = vhaddq_s32(a2, a3);
+ int32x4_t b3 = vhsubq_s32(a2, a3);
+
+ int32x4_t c0 = vaddq_s32(b0, b2);
+ int32x4_t c1 = vaddq_s32(b1, b3);
+ int32x4_t c2 = vsubq_s32(b0, b2);
+ int32x4_t c3 = vsubq_s32(b1, b3);
+
+ vst1q_s32(coeff + 4 * i, c0);
+ vst1q_s32(coeff + 4 * i + 64, c1);
+ vst1q_s32(coeff + 4 * i + 128, c2);
+ vst1q_s32(coeff + 4 * i + 192, c3);
+ }
+}
+
+void aom_highbd_hadamard_32x32_neon(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ // Rearrange 32x32 to 16x64 and remove stride.
+ // Top left first.
+ aom_highbd_hadamard_16x16_neon(src_diff, src_stride, coeff);
+ // Top right.
+ aom_highbd_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256);
+ // Bottom left.
+ aom_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride,
+ coeff + 512);
+ // Bottom right.
+ aom_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride + 16, src_stride,
+ coeff + 768);
+
+ for (int i = 0; i < 64; i++) {
+ int32x4_t a0 = vld1q_s32(coeff + 4 * i);
+ int32x4_t a1 = vld1q_s32(coeff + 4 * i + 256);
+ int32x4_t a2 = vld1q_s32(coeff + 4 * i + 512);
+ int32x4_t a3 = vld1q_s32(coeff + 4 * i + 768);
+
+ int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2);
+ int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2);
+ int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2);
+ int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2);
+
+ int32x4_t c0 = vaddq_s32(b0, b2);
+ int32x4_t c1 = vaddq_s32(b1, b3);
+ int32x4_t c2 = vsubq_s32(b0, b2);
+ int32x4_t c3 = vsubq_s32(b1, b3);
+
+ vst1q_s32(coeff + 4 * i, c0);
+ vst1q_s32(coeff + 4 * i + 256, c1);
+ vst1q_s32(coeff + 4 * i + 512, c2);
+ vst1q_s32(coeff + 4 * i + 768, c3);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_intrapred_neon.c b/third_party/aom/aom_dsp/arm/highbd_intrapred_neon.c
new file mode 100644
index 0000000000..dc47974c68
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_intrapred_neon.c
@@ -0,0 +1,2730 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/intrapred_common.h"
+
+// -----------------------------------------------------------------------------
+// DC
+
+static INLINE void highbd_dc_store_4xh(uint16_t *dst, ptrdiff_t stride, int h,
+ uint16x4_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1_u16(dst + i * stride, dc);
+ }
+}
+
+static INLINE void highbd_dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int h,
+ uint16x8_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u16(dst + i * stride, dc);
+ }
+}
+
+static INLINE void highbd_dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int h,
+ uint16x8_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u16(dst + i * stride, dc);
+ vst1q_u16(dst + i * stride + 8, dc);
+ }
+}
+
+static INLINE void highbd_dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int h,
+ uint16x8_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u16(dst + i * stride, dc);
+ vst1q_u16(dst + i * stride + 8, dc);
+ vst1q_u16(dst + i * stride + 16, dc);
+ vst1q_u16(dst + i * stride + 24, dc);
+ }
+}
+
+static INLINE void highbd_dc_store_64xh(uint16_t *dst, ptrdiff_t stride, int h,
+ uint16x8_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u16(dst + i * stride, dc);
+ vst1q_u16(dst + i * stride + 8, dc);
+ vst1q_u16(dst + i * stride + 16, dc);
+ vst1q_u16(dst + i * stride + 24, dc);
+ vst1q_u16(dst + i * stride + 32, dc);
+ vst1q_u16(dst + i * stride + 40, dc);
+ vst1q_u16(dst + i * stride + 48, dc);
+ vst1q_u16(dst + i * stride + 56, dc);
+ }
+}
+
+static INLINE uint32x4_t horizontal_add_and_broadcast_long_u16x8(uint16x8_t a) {
+  // The input can be up to 16 bits wide (from the dc 64x64 partial sum), so
+  // widen to 32 bits before accumulating.
+ const uint32x4_t b = vpaddlq_u16(a);
+#if AOM_ARCH_AARCH64
+ const uint32x4_t c = vpaddq_u32(b, b);
+ return vpaddq_u32(c, c);
+#else
+ const uint32x2_t c = vadd_u32(vget_low_u32(b), vget_high_u32(b));
+ const uint32x2_t d = vpadd_u32(c, c);
+ return vcombine_u32(d, d);
+#endif
+}
+
+static INLINE uint16x8_t highbd_dc_load_partial_sum_4(const uint16_t *left) {
+  // No summation is needed here since the data already fits in one vector,
+  // but this helper avoids special-casing w == 4 and h == 4. The combine is
+  // zero cost for a sane compiler since vld1 already sets the top half of a
+  // vector to zero as part of the operation.
+ return vcombine_u16(vld1_u16(left), vdup_n_u16(0));
+}
+
+static INLINE uint16x8_t highbd_dc_load_partial_sum_8(const uint16_t *left) {
+  // No summation is needed here since the data already fits in one vector,
+  // but this helper avoids special-casing w == 8 and h == 8.
+ return vld1q_u16(left);
+}
+
+static INLINE uint16x8_t highbd_dc_load_partial_sum_16(const uint16_t *left) {
+ const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits
+ const uint16x8_t a1 = vld1q_u16(left + 8);
+ return vaddq_u16(a0, a1); // up to 13 bits
+}
+
+static INLINE uint16x8_t highbd_dc_load_partial_sum_32(const uint16_t *left) {
+ const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits
+ const uint16x8_t a1 = vld1q_u16(left + 8);
+ const uint16x8_t a2 = vld1q_u16(left + 16);
+ const uint16x8_t a3 = vld1q_u16(left + 24);
+ const uint16x8_t b0 = vaddq_u16(a0, a1); // up to 13 bits
+ const uint16x8_t b1 = vaddq_u16(a2, a3);
+ return vaddq_u16(b0, b1); // up to 14 bits
+}
+
+static INLINE uint16x8_t highbd_dc_load_partial_sum_64(const uint16_t *left) {
+ const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits
+ const uint16x8_t a1 = vld1q_u16(left + 8);
+ const uint16x8_t a2 = vld1q_u16(left + 16);
+ const uint16x8_t a3 = vld1q_u16(left + 24);
+ const uint16x8_t a4 = vld1q_u16(left + 32);
+ const uint16x8_t a5 = vld1q_u16(left + 40);
+ const uint16x8_t a6 = vld1q_u16(left + 48);
+ const uint16x8_t a7 = vld1q_u16(left + 56);
+ const uint16x8_t b0 = vaddq_u16(a0, a1); // up to 13 bits
+ const uint16x8_t b1 = vaddq_u16(a2, a3);
+ const uint16x8_t b2 = vaddq_u16(a4, a5);
+ const uint16x8_t b3 = vaddq_u16(a6, a7);
+ const uint16x8_t c0 = vaddq_u16(b0, b1); // up to 14 bits
+ const uint16x8_t c1 = vaddq_u16(b2, b3);
+ return vaddq_u16(c0, c1); // up to 15 bits
+}
+
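+// For square blocks the number of samples (w + h) is a power of two, so the
+// average reduces to a single rounding shift by log2(w + h).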
+#define HIGHBD_DC_PREDICTOR(w, h, shift) \
+ void aom_highbd_dc_predictor_##w##x##h##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ const uint16x8_t a = highbd_dc_load_partial_sum_##w(above); \
+ const uint16x8_t l = highbd_dc_load_partial_sum_##h(left); \
+ const uint32x4_t sum = \
+ horizontal_add_and_broadcast_long_u16x8(vaddq_u16(a, l)); \
+ const uint16x4_t dc0 = vrshrn_n_u32(sum, shift); \
+ highbd_dc_store_##w##xh(dst, stride, (h), vdupq_lane_u16(dc0, 0)); \
+ }
+
+void aom_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+  // In the rectangular cases we simply extend the shorter vector to uint16x8
+  // in order to accumulate. In the 4x4 case, however, there is no shorter
+  // vector to extend, so it is beneficial to do the whole calculation in
+  // uint16x4 instead.
+ (void)bd;
+ const uint16x4_t a = vld1_u16(above); // up to 12 bits
+ const uint16x4_t l = vld1_u16(left);
+ uint16x4_t sum = vpadd_u16(a, l); // up to 13 bits
+ sum = vpadd_u16(sum, sum); // up to 14 bits
+ sum = vpadd_u16(sum, sum);
+ const uint16x4_t dc = vrshr_n_u16(sum, 3);
+ highbd_dc_store_4xh(dst, stride, 4, dc);
+}
+
+HIGHBD_DC_PREDICTOR(8, 8, 4)
+HIGHBD_DC_PREDICTOR(16, 16, 5)
+HIGHBD_DC_PREDICTOR(32, 32, 6)
+HIGHBD_DC_PREDICTOR(64, 64, 7)
+
+#undef HIGHBD_DC_PREDICTOR
+
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+ int multiplier, int shift2) {
+ const int interm = num >> shift1;
+ return interm * multiplier >> shift2;
+}
+
+#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
+#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
+#define HIGHBD_DC_SHIFT2 17
+
+static INLINE int highbd_dc_predictor_rect(int bw, int bh, int sum, int shift1,
+ uint32_t multiplier) {
+ return divide_using_multiply_shift(sum + ((bw + bh) >> 1), shift1, multiplier,
+ HIGHBD_DC_SHIFT2);
+}
+
+#undef HIGHBD_DC_SHIFT2
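+
+// Rectangular blocks have w + h equal to 3 or 5 times the shorter dimension,
+// which is not a power of two. The division is therefore performed as a shift
+// by log2(min(w, h)) followed by a fixed-point multiplication by an
+// approximation of 1/3 (0xAAAB / 2^17) or 1/5 (0x6667 / 2^17). For example,
+// for a 4x8 block:
+//   dc = ((sum + 6) >> 2) * 0xAAAB >> 17  ~=  (sum + 6) / 12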
+
+#define HIGHBD_DC_PREDICTOR_RECT(w, h, q, shift, mult) \
+ void aom_highbd_dc_predictor_##w##x##h##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ uint16x8_t sum_above = highbd_dc_load_partial_sum_##w(above); \
+ uint16x8_t sum_left = highbd_dc_load_partial_sum_##h(left); \
+ uint16x8_t sum_vec = vaddq_u16(sum_left, sum_above); \
+ int sum = horizontal_add_u16x8(sum_vec); \
+ int dc0 = highbd_dc_predictor_rect((w), (h), sum, (shift), (mult)); \
+ highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u16(dc0)); \
+ }
+
+HIGHBD_DC_PREDICTOR_RECT(4, 8, , 2, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(4, 16, , 2, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(8, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(8, 16, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(8, 32, q, 3, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(16, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(16, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(16, 32, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(16, 64, q, 4, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(32, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(32, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(32, 64, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(64, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(64, 32, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
+
+#undef HIGHBD_DC_PREDICTOR_RECT
+#undef HIGHBD_DC_MULTIPLIER_1X2
+#undef HIGHBD_DC_MULTIPLIER_1X4
+
+// -----------------------------------------------------------------------------
+// DC_128
+
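+// Fill the block with the mid-range sample value for the bit depth:
+// 0x80 << (bd - 8) == 1 << (bd - 1), i.e. 512 at 10 bits and 2048 at 12 bits.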
+#define HIGHBD_DC_PREDICTOR_128(w, h, q) \
+ void aom_highbd_dc_128_predictor_##w##x##h##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)above; \
+ (void)bd; \
+ (void)left; \
+ highbd_dc_store_##w##xh(dst, stride, (h), \
+ vdup##q##_n_u16(0x80 << (bd - 8))); \
+ }
+
+HIGHBD_DC_PREDICTOR_128(4, 4, )
+HIGHBD_DC_PREDICTOR_128(4, 8, )
+HIGHBD_DC_PREDICTOR_128(4, 16, )
+HIGHBD_DC_PREDICTOR_128(8, 4, q)
+HIGHBD_DC_PREDICTOR_128(8, 8, q)
+HIGHBD_DC_PREDICTOR_128(8, 16, q)
+HIGHBD_DC_PREDICTOR_128(8, 32, q)
+HIGHBD_DC_PREDICTOR_128(16, 4, q)
+HIGHBD_DC_PREDICTOR_128(16, 8, q)
+HIGHBD_DC_PREDICTOR_128(16, 16, q)
+HIGHBD_DC_PREDICTOR_128(16, 32, q)
+HIGHBD_DC_PREDICTOR_128(16, 64, q)
+HIGHBD_DC_PREDICTOR_128(32, 8, q)
+HIGHBD_DC_PREDICTOR_128(32, 16, q)
+HIGHBD_DC_PREDICTOR_128(32, 32, q)
+HIGHBD_DC_PREDICTOR_128(32, 64, q)
+HIGHBD_DC_PREDICTOR_128(64, 16, q)
+HIGHBD_DC_PREDICTOR_128(64, 32, q)
+HIGHBD_DC_PREDICTOR_128(64, 64, q)
+
+#undef HIGHBD_DC_PREDICTOR_128
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
+static INLINE uint32x4_t highbd_dc_load_sum_4(const uint16_t *left) {
+ const uint16x4_t a = vld1_u16(left); // up to 12 bits
+ const uint16x4_t b = vpadd_u16(a, a); // up to 13 bits
+ return vcombine_u32(vpaddl_u16(b), vdup_n_u32(0));
+}
+
+static INLINE uint32x4_t highbd_dc_load_sum_8(const uint16_t *left) {
+ return horizontal_add_and_broadcast_long_u16x8(vld1q_u16(left));
+}
+
+static INLINE uint32x4_t highbd_dc_load_sum_16(const uint16_t *left) {
+ return horizontal_add_and_broadcast_long_u16x8(
+ highbd_dc_load_partial_sum_16(left));
+}
+
+static INLINE uint32x4_t highbd_dc_load_sum_32(const uint16_t *left) {
+ return horizontal_add_and_broadcast_long_u16x8(
+ highbd_dc_load_partial_sum_32(left));
+}
+
+static INLINE uint32x4_t highbd_dc_load_sum_64(const uint16_t *left) {
+ return horizontal_add_and_broadcast_long_u16x8(
+ highbd_dc_load_partial_sum_64(left));
+}
+
+#define DC_PREDICTOR_LEFT(w, h, shift, q) \
+ void aom_highbd_dc_left_predictor_##w##x##h##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)above; \
+ (void)bd; \
+ const uint32x4_t sum = highbd_dc_load_sum_##h(left); \
+ const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift)); \
+ highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
+ }
+
+DC_PREDICTOR_LEFT(4, 4, 2, )
+DC_PREDICTOR_LEFT(4, 8, 3, )
+DC_PREDICTOR_LEFT(4, 16, 4, )
+DC_PREDICTOR_LEFT(8, 4, 2, q)
+DC_PREDICTOR_LEFT(8, 8, 3, q)
+DC_PREDICTOR_LEFT(8, 16, 4, q)
+DC_PREDICTOR_LEFT(8, 32, 5, q)
+DC_PREDICTOR_LEFT(16, 4, 2, q)
+DC_PREDICTOR_LEFT(16, 8, 3, q)
+DC_PREDICTOR_LEFT(16, 16, 4, q)
+DC_PREDICTOR_LEFT(16, 32, 5, q)
+DC_PREDICTOR_LEFT(16, 64, 6, q)
+DC_PREDICTOR_LEFT(32, 8, 3, q)
+DC_PREDICTOR_LEFT(32, 16, 4, q)
+DC_PREDICTOR_LEFT(32, 32, 5, q)
+DC_PREDICTOR_LEFT(32, 64, 6, q)
+DC_PREDICTOR_LEFT(64, 16, 4, q)
+DC_PREDICTOR_LEFT(64, 32, 5, q)
+DC_PREDICTOR_LEFT(64, 64, 6, q)
+
+#undef DC_PREDICTOR_LEFT
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+#define DC_PREDICTOR_TOP(w, h, shift, q) \
+ void aom_highbd_dc_top_predictor_##w##x##h##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ (void)left; \
+ const uint32x4_t sum = highbd_dc_load_sum_##w(above); \
+ const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift)); \
+ highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
+ }
+
+DC_PREDICTOR_TOP(4, 4, 2, )
+DC_PREDICTOR_TOP(4, 8, 2, )
+DC_PREDICTOR_TOP(4, 16, 2, )
+DC_PREDICTOR_TOP(8, 4, 3, q)
+DC_PREDICTOR_TOP(8, 8, 3, q)
+DC_PREDICTOR_TOP(8, 16, 3, q)
+DC_PREDICTOR_TOP(8, 32, 3, q)
+DC_PREDICTOR_TOP(16, 4, 4, q)
+DC_PREDICTOR_TOP(16, 8, 4, q)
+DC_PREDICTOR_TOP(16, 16, 4, q)
+DC_PREDICTOR_TOP(16, 32, 4, q)
+DC_PREDICTOR_TOP(16, 64, 4, q)
+DC_PREDICTOR_TOP(32, 8, 5, q)
+DC_PREDICTOR_TOP(32, 16, 5, q)
+DC_PREDICTOR_TOP(32, 32, 5, q)
+DC_PREDICTOR_TOP(32, 64, 5, q)
+DC_PREDICTOR_TOP(64, 16, 6, q)
+DC_PREDICTOR_TOP(64, 32, 6, q)
+DC_PREDICTOR_TOP(64, 64, 6, q)
+
+#undef DC_PREDICTOR_TOP
+
+// -----------------------------------------------------------------------------
+// V_PRED
+
+#define HIGHBD_V_NXM(W, H) \
+ void aom_highbd_v_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)left; \
+ (void)bd; \
+ vertical##W##xh_neon(dst, stride, above, H); \
+ }
+
+static INLINE uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) {
+ uint16x8x2_t x;
+ // Clang/gcc uses ldp here.
+ x.val[0] = vld1q_u16(ptr);
+ x.val[1] = vld1q_u16(ptr + 8);
+ return x;
+}
+
+static INLINE void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) {
+ vst1q_u16(ptr, x.val[0]);
+ vst1q_u16(ptr + 8, x.val[1]);
+}
+
+static INLINE void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const above, int height) {
+ const uint16x4_t row = vld1_u16(above);
+ int y = height;
+ do {
+ vst1_u16(dst, row);
+ vst1_u16(dst + stride, row);
+ dst += stride << 1;
+ y -= 2;
+ } while (y != 0);
+}
+
+static INLINE void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const above, int height) {
+ const uint16x8_t row = vld1q_u16(above);
+ int y = height;
+ do {
+ vst1q_u16(dst, row);
+ vst1q_u16(dst + stride, row);
+ dst += stride << 1;
+ y -= 2;
+ } while (y != 0);
+}
+
+static INLINE void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const above, int height) {
+ const uint16x8x2_t row = load_uint16x8x2(above);
+ int y = height;
+ do {
+ store_uint16x8x2(dst, row);
+ store_uint16x8x2(dst + stride, row);
+ dst += stride << 1;
+ y -= 2;
+ } while (y != 0);
+}
+
+static INLINE uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) {
+ uint16x8x4_t x;
+ // Clang/gcc uses ldp here.
+ x.val[0] = vld1q_u16(ptr);
+ x.val[1] = vld1q_u16(ptr + 8);
+ x.val[2] = vld1q_u16(ptr + 16);
+ x.val[3] = vld1q_u16(ptr + 24);
+ return x;
+}
+
+static INLINE void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) {
+ vst1q_u16(ptr, x.val[0]);
+ vst1q_u16(ptr + 8, x.val[1]);
+ vst1q_u16(ptr + 16, x.val[2]);
+ vst1q_u16(ptr + 24, x.val[3]);
+}
+
+static INLINE void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const above, int height) {
+ const uint16x8x4_t row = load_uint16x8x4(above);
+ int y = height;
+ do {
+ store_uint16x8x4(dst, row);
+ store_uint16x8x4(dst + stride, row);
+ dst += stride << 1;
+ y -= 2;
+ } while (y != 0);
+}
+
+static INLINE void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const above, int height) {
+ uint16_t *dst32 = dst + 32;
+ const uint16x8x4_t row = load_uint16x8x4(above);
+ const uint16x8x4_t row32 = load_uint16x8x4(above + 32);
+ int y = height;
+ do {
+ store_uint16x8x4(dst, row);
+ store_uint16x8x4(dst32, row32);
+ store_uint16x8x4(dst + stride, row);
+ store_uint16x8x4(dst32 + stride, row32);
+ dst += stride << 1;
+ dst32 += stride << 1;
+ y -= 2;
+ } while (y != 0);
+}
+
+HIGHBD_V_NXM(4, 4)
+HIGHBD_V_NXM(4, 8)
+HIGHBD_V_NXM(4, 16)
+
+HIGHBD_V_NXM(8, 4)
+HIGHBD_V_NXM(8, 8)
+HIGHBD_V_NXM(8, 16)
+HIGHBD_V_NXM(8, 32)
+
+HIGHBD_V_NXM(16, 4)
+HIGHBD_V_NXM(16, 8)
+HIGHBD_V_NXM(16, 16)
+HIGHBD_V_NXM(16, 32)
+HIGHBD_V_NXM(16, 64)
+
+HIGHBD_V_NXM(32, 8)
+HIGHBD_V_NXM(32, 16)
+HIGHBD_V_NXM(32, 32)
+HIGHBD_V_NXM(32, 64)
+
+HIGHBD_V_NXM(64, 16)
+HIGHBD_V_NXM(64, 32)
+HIGHBD_V_NXM(64, 64)
+
+// -----------------------------------------------------------------------------
+// H_PRED
+
+static INLINE void highbd_h_store_4x4(uint16_t *dst, ptrdiff_t stride,
+ uint16x4_t left) {
+ vst1_u16(dst + 0 * stride, vdup_lane_u16(left, 0));
+ vst1_u16(dst + 1 * stride, vdup_lane_u16(left, 1));
+ vst1_u16(dst + 2 * stride, vdup_lane_u16(left, 2));
+ vst1_u16(dst + 3 * stride, vdup_lane_u16(left, 3));
+}
+
+static INLINE void highbd_h_store_8x4(uint16_t *dst, ptrdiff_t stride,
+ uint16x4_t left) {
+ vst1q_u16(dst + 0 * stride, vdupq_lane_u16(left, 0));
+ vst1q_u16(dst + 1 * stride, vdupq_lane_u16(left, 1));
+ vst1q_u16(dst + 2 * stride, vdupq_lane_u16(left, 2));
+ vst1q_u16(dst + 3 * stride, vdupq_lane_u16(left, 3));
+}
+
+static INLINE void highbd_h_store_16x1(uint16_t *dst, uint16x8_t left) {
+ vst1q_u16(dst + 0, left);
+ vst1q_u16(dst + 8, left);
+}
+
+static INLINE void highbd_h_store_16x4(uint16_t *dst, ptrdiff_t stride,
+ uint16x4_t left) {
+ highbd_h_store_16x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
+ highbd_h_store_16x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
+ highbd_h_store_16x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
+ highbd_h_store_16x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
+}
+
+static INLINE void highbd_h_store_32x1(uint16_t *dst, uint16x8_t left) {
+ vst1q_u16(dst + 0, left);
+ vst1q_u16(dst + 8, left);
+ vst1q_u16(dst + 16, left);
+ vst1q_u16(dst + 24, left);
+}
+
+static INLINE void highbd_h_store_32x4(uint16_t *dst, ptrdiff_t stride,
+ uint16x4_t left) {
+ highbd_h_store_32x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
+ highbd_h_store_32x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
+ highbd_h_store_32x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
+ highbd_h_store_32x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
+}
+
+static INLINE void highbd_h_store_64x1(uint16_t *dst, uint16x8_t left) {
+ vst1q_u16(dst + 0, left);
+ vst1q_u16(dst + 8, left);
+ vst1q_u16(dst + 16, left);
+ vst1q_u16(dst + 24, left);
+ vst1q_u16(dst + 32, left);
+ vst1q_u16(dst + 40, left);
+ vst1q_u16(dst + 48, left);
+ vst1q_u16(dst + 56, left);
+}
+
+static INLINE void highbd_h_store_64x4(uint16_t *dst, ptrdiff_t stride,
+ uint16x4_t left) {
+ highbd_h_store_64x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
+ highbd_h_store_64x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
+ highbd_h_store_64x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
+ highbd_h_store_64x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
+}
+
+void aom_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ highbd_h_store_4x4(dst, stride, vld1_u16(left));
+}
+
+void aom_highbd_h_predictor_4x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ uint16x8_t l = vld1q_u16(left);
+ highbd_h_store_4x4(dst + 0 * stride, stride, vget_low_u16(l));
+ highbd_h_store_4x4(dst + 4 * stride, stride, vget_high_u16(l));
+}
+
+void aom_highbd_h_predictor_8x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ highbd_h_store_8x4(dst, stride, vld1_u16(left));
+}
+
+void aom_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ uint16x8_t l = vld1q_u16(left);
+ highbd_h_store_8x4(dst + 0 * stride, stride, vget_low_u16(l));
+ highbd_h_store_8x4(dst + 4 * stride, stride, vget_high_u16(l));
+}
+
+void aom_highbd_h_predictor_16x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ highbd_h_store_16x4(dst, stride, vld1_u16(left));
+}
+
+void aom_highbd_h_predictor_16x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ uint16x8_t l = vld1q_u16(left);
+ highbd_h_store_16x4(dst + 0 * stride, stride, vget_low_u16(l));
+ highbd_h_store_16x4(dst + 4 * stride, stride, vget_high_u16(l));
+}
+
+void aom_highbd_h_predictor_32x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ uint16x8_t l = vld1q_u16(left);
+ highbd_h_store_32x4(dst + 0 * stride, stride, vget_low_u16(l));
+ highbd_h_store_32x4(dst + 4 * stride, stride, vget_high_u16(l));
+}
+
+// For cases where height >= 16 we use pairs of loads to get LDP instructions.
+#define HIGHBD_H_WXH_LARGE(w, h) \
+ void aom_highbd_h_predictor_##w##x##h##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)above; \
+ (void)bd; \
+ for (int i = 0; i < (h) / 16; ++i) { \
+ uint16x8_t l0 = vld1q_u16(left + 0); \
+ uint16x8_t l1 = vld1q_u16(left + 8); \
+ highbd_h_store_##w##x4(dst + 0 * stride, stride, vget_low_u16(l0)); \
+ highbd_h_store_##w##x4(dst + 4 * stride, stride, vget_high_u16(l0)); \
+ highbd_h_store_##w##x4(dst + 8 * stride, stride, vget_low_u16(l1)); \
+ highbd_h_store_##w##x4(dst + 12 * stride, stride, vget_high_u16(l1)); \
+ left += 16; \
+ dst += 16 * stride; \
+ } \
+ }
+
+HIGHBD_H_WXH_LARGE(4, 16)
+HIGHBD_H_WXH_LARGE(8, 16)
+HIGHBD_H_WXH_LARGE(8, 32)
+HIGHBD_H_WXH_LARGE(16, 16)
+HIGHBD_H_WXH_LARGE(16, 32)
+HIGHBD_H_WXH_LARGE(16, 64)
+HIGHBD_H_WXH_LARGE(32, 16)
+HIGHBD_H_WXH_LARGE(32, 32)
+HIGHBD_H_WXH_LARGE(32, 64)
+HIGHBD_H_WXH_LARGE(64, 16)
+HIGHBD_H_WXH_LARGE(64, 32)
+HIGHBD_H_WXH_LARGE(64, 64)
+
+#undef HIGHBD_H_WXH_LARGE
+
+// -----------------------------------------------------------------------------
+// PAETH
+
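+// The Paeth predictor selects whichever of left, top and top_left is closest
+// to the base value p = left + top - top_left. The left_dist, top_dist and
+// top_left_dist values computed below are |p - left|, |p - top| and
+// |p - top_left| respectively.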
+static INLINE void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride,
+ const uint16_t *const top_row,
+ const uint16_t *const left_column,
+ int width, int height) {
+ const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
+ const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
+ uint16x8_t top;
+ if (width == 4) {
+ top = vcombine_u16(vld1_u16(top_row), vdup_n_u16(0));
+ } else { // width == 8
+ top = vld1q_u16(top_row);
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint16x8_t left = vdupq_n_u16(left_column[y]);
+
+ const uint16x8_t left_dist = vabdq_u16(top, top_left);
+ const uint16x8_t top_dist = vabdq_u16(left, top_left);
+ const uint16x8_t top_left_dist =
+ vabdq_u16(vaddq_u16(top, left), top_left_x2);
+
+ const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
+ const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
+ const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
+
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint16x8_t result = vbslq_u16(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ result = vbslq_u16(left_or_top_mask, result, top_left);
+
+ if (width == 4) {
+ vst1_u16(dest, vget_low_u16(result));
+ } else { // width == 8
+ vst1q_u16(dest, result);
+ }
+ dest += stride;
+ }
+}
+
+#define HIGHBD_PAETH_NXM(W, H) \
+ void aom_highbd_paeth_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
+ }
+
+HIGHBD_PAETH_NXM(4, 4)
+HIGHBD_PAETH_NXM(4, 8)
+HIGHBD_PAETH_NXM(4, 16)
+HIGHBD_PAETH_NXM(8, 4)
+HIGHBD_PAETH_NXM(8, 8)
+HIGHBD_PAETH_NXM(8, 16)
+HIGHBD_PAETH_NXM(8, 32)
+
+// Select the closest values and collect them.
+static INLINE uint16x8_t select_paeth(const uint16x8_t top,
+ const uint16x8_t left,
+ const uint16x8_t top_left,
+ const uint16x8_t left_le_top,
+ const uint16x8_t left_le_top_left,
+ const uint16x8_t top_le_top_left) {
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ const uint16x8_t result = vbslq_u16(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ return vbslq_u16(left_or_top_mask, result, top_left);
+}
+
+#define PAETH_PREDICTOR(num) \
+ do { \
+ const uint16x8_t left_dist = vabdq_u16(top[num], top_left); \
+ const uint16x8_t top_left_dist = \
+ vabdq_u16(vaddq_u16(top[num], left), top_left_x2); \
+ const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist); \
+ const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist); \
+ const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist); \
+ const uint16x8_t result = \
+ select_paeth(top[num], left, top_left, left_le_top, left_le_top_left, \
+ top_le_top_left); \
+ vst1q_u16(dest + (num * 8), result); \
+ } while (0)
+
+#define LOAD_TOP_ROW(num) vld1q_u16(top_row + (num * 8))
+
+static INLINE void highbd_paeth16_plus_x_h_neon(
+ uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row,
+ const uint16_t *const left_column, int width, int height) {
+ const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
+ const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
+ uint16x8_t top[8];
+ top[0] = LOAD_TOP_ROW(0);
+ top[1] = LOAD_TOP_ROW(1);
+ if (width > 16) {
+ top[2] = LOAD_TOP_ROW(2);
+ top[3] = LOAD_TOP_ROW(3);
+ if (width == 64) {
+ top[4] = LOAD_TOP_ROW(4);
+ top[5] = LOAD_TOP_ROW(5);
+ top[6] = LOAD_TOP_ROW(6);
+ top[7] = LOAD_TOP_ROW(7);
+ }
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint16x8_t left = vdupq_n_u16(left_column[y]);
+ const uint16x8_t top_dist = vabdq_u16(left, top_left);
+ PAETH_PREDICTOR(0);
+ PAETH_PREDICTOR(1);
+ if (width > 16) {
+ PAETH_PREDICTOR(2);
+ PAETH_PREDICTOR(3);
+ if (width == 64) {
+ PAETH_PREDICTOR(4);
+ PAETH_PREDICTOR(5);
+ PAETH_PREDICTOR(6);
+ PAETH_PREDICTOR(7);
+ }
+ }
+ dest += stride;
+ }
+}
+
+#define HIGHBD_PAETH_NXM_WIDE(W, H) \
+ void aom_highbd_paeth_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
+ }
+
+HIGHBD_PAETH_NXM_WIDE(16, 4)
+HIGHBD_PAETH_NXM_WIDE(16, 8)
+HIGHBD_PAETH_NXM_WIDE(16, 16)
+HIGHBD_PAETH_NXM_WIDE(16, 32)
+HIGHBD_PAETH_NXM_WIDE(16, 64)
+HIGHBD_PAETH_NXM_WIDE(32, 8)
+HIGHBD_PAETH_NXM_WIDE(32, 16)
+HIGHBD_PAETH_NXM_WIDE(32, 32)
+HIGHBD_PAETH_NXM_WIDE(32, 64)
+HIGHBD_PAETH_NXM_WIDE(64, 16)
+HIGHBD_PAETH_NXM_WIDE(64, 32)
+HIGHBD_PAETH_NXM_WIDE(64, 64)
+
+// -----------------------------------------------------------------------------
+// SMOOTH
+
+// 256 - v = vneg_s8(v)
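+// The smooth weights are positive 8-bit values held in 16-bit lanes with a
+// zero high byte, so byte-wise negation leaves the high byte zero and turns
+// the low byte into 256 - v.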
+static INLINE uint16x4_t negate_s8(const uint16x4_t v) {
+ return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
+}
+
+static INLINE void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const top_row,
+ const uint16_t *const left_column,
+ const int height) {
+ const uint16_t top_right = top_row[3];
+ const uint16_t bottom_left = left_column[height - 1];
+ const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
+
+ const uint16x4_t top_v = vld1_u16(top_row);
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+ const uint16x4_t weights_x_v = vld1_u16(smooth_weights_u16);
+ const uint16x4_t scaled_weights_x = negate_s8(weights_x_v);
+ const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
+
+ for (int y = 0; y < height; ++y) {
+ // Each variable in the running summation is named for the last item to be
+ // accumulated.
+ const uint32x4_t weighted_top =
+ vmlal_n_u16(weighted_tr, top_v, weights_y[y]);
+ const uint32x4_t weighted_left =
+ vmlal_n_u16(weighted_top, weights_x_v, left_column[y]);
+ const uint32x4_t weighted_bl =
+ vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]);
+
+ const uint16x4_t pred =
+ vrshrn_n_u32(weighted_bl, SMOOTH_WEIGHT_LOG2_SCALE + 1);
+ vst1_u16(dst, pred);
+ dst += stride;
+ }
+}
+
+// Common code between 8xH and [16|32|64]xH.
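+// Both callers accumulate
+//   weight_y * top + (256 - weight_y) * bottom_left +
+//     weight_x * left + (256 - weight_x) * top_right
+// and then narrow with a rounding shift by SMOOTH_WEIGHT_LOG2_SCALE + 1.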
+static INLINE void highbd_calculate_pred8(
+ uint16_t *dst, const uint32x4_t weighted_corners_low,
+ const uint32x4_t weighted_corners_high, const uint16x4x2_t top_vals,
+ const uint16x4x2_t weights_x, const uint16_t left_y,
+ const uint16_t weight_y) {
+ // Each variable in the running summation is named for the last item to be
+ // accumulated.
+ const uint32x4_t weighted_top_low =
+ vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y);
+ const uint32x4_t weighted_edges_low =
+ vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y);
+
+ const uint16x4_t pred_low =
+ vrshrn_n_u32(weighted_edges_low, SMOOTH_WEIGHT_LOG2_SCALE + 1);
+ vst1_u16(dst, pred_low);
+
+ const uint32x4_t weighted_top_high =
+ vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y);
+ const uint32x4_t weighted_edges_high =
+ vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y);
+
+ const uint16x4_t pred_high =
+ vrshrn_n_u32(weighted_edges_high, SMOOTH_WEIGHT_LOG2_SCALE + 1);
+ vst1_u16(dst + 4, pred_high);
+}
+
+static void highbd_smooth_8xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const top_row,
+ const uint16_t *const left_column,
+ const int height) {
+ const uint16_t top_right = top_row[7];
+ const uint16_t bottom_left = left_column[height - 1];
+ const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
+
+ const uint16x4x2_t top_vals = { { vld1_u16(top_row),
+ vld1_u16(top_row + 4) } };
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+ const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
+ vld1_u16(smooth_weights_u16 + 8) } };
+ const uint32x4_t weighted_tr_low =
+ vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
+ const uint32x4_t weighted_tr_high =
+ vmull_n_u16(negate_s8(weights_x.val[1]), top_right);
+
+ for (int y = 0; y < height; ++y) {
+ const uint32x4_t weighted_bl =
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+ const uint32x4_t weighted_corners_low =
+ vaddq_u32(weighted_bl, weighted_tr_low);
+ const uint32x4_t weighted_corners_high =
+ vaddq_u32(weighted_bl, weighted_tr_high);
+ highbd_calculate_pred8(dst, weighted_corners_low, weighted_corners_high,
+ top_vals, weights_x, left_column[y], weights_y[y]);
+ dst += stride;
+ }
+}
+
+#define HIGHBD_SMOOTH_NXM(W, H) \
+ void aom_highbd_smooth_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+HIGHBD_SMOOTH_NXM(4, 4)
+HIGHBD_SMOOTH_NXM(4, 8)
+HIGHBD_SMOOTH_NXM(8, 4)
+HIGHBD_SMOOTH_NXM(8, 8)
+HIGHBD_SMOOTH_NXM(4, 16)
+HIGHBD_SMOOTH_NXM(8, 16)
+HIGHBD_SMOOTH_NXM(8, 32)
+
+#undef HIGHBD_SMOOTH_NXM
+
+// For width 16 and above.
+#define HIGHBD_SMOOTH_PREDICTOR(W) \
+ static void highbd_smooth_##W##xh_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \
+ const uint16_t *const left_column, const int height) { \
+ const uint16_t top_right = top_row[(W)-1]; \
+ const uint16_t bottom_left = left_column[height - 1]; \
+ const uint16_t *const weights_y = smooth_weights_u16 + height - 4; \
+ \
+ /* Precompute weighted values that don't vary with |y|. */ \
+ uint32x4_t weighted_tr_low[(W) >> 3]; \
+ uint32x4_t weighted_tr_high[(W) >> 3]; \
+ for (int i = 0; i < (W) >> 3; ++i) { \
+ const int x = i << 3; \
+ const uint16x4_t weights_x_low = \
+ vld1_u16(smooth_weights_u16 + (W)-4 + x); \
+ weighted_tr_low[i] = vmull_n_u16(negate_s8(weights_x_low), top_right); \
+ const uint16x4_t weights_x_high = \
+ vld1_u16(smooth_weights_u16 + (W) + x); \
+ weighted_tr_high[i] = vmull_n_u16(negate_s8(weights_x_high), top_right); \
+ } \
+ \
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); \
+ for (int y = 0; y < height; ++y) { \
+ const uint32x4_t weighted_bl = \
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]); \
+ uint16_t *dst_x = dst; \
+ for (int i = 0; i < (W) >> 3; ++i) { \
+ const int x = i << 3; \
+ const uint16x4x2_t top_vals = { { vld1_u16(top_row + x), \
+ vld1_u16(top_row + x + 4) } }; \
+ const uint32x4_t weighted_corners_low = \
+ vaddq_u32(weighted_bl, weighted_tr_low[i]); \
+ const uint32x4_t weighted_corners_high = \
+ vaddq_u32(weighted_bl, weighted_tr_high[i]); \
+ /* Accumulate weighted edge values and store. */ \
+ const uint16x4x2_t weights_x = { \
+ { vld1_u16(smooth_weights_u16 + (W)-4 + x), \
+ vld1_u16(smooth_weights_u16 + (W) + x) } \
+ }; \
+ highbd_calculate_pred8(dst_x, weighted_corners_low, \
+ weighted_corners_high, top_vals, weights_x, \
+ left_column[y], weights_y[y]); \
+ dst_x += 8; \
+ } \
+ dst += stride; \
+ } \
+ }
+
+HIGHBD_SMOOTH_PREDICTOR(16)
+HIGHBD_SMOOTH_PREDICTOR(32)
+HIGHBD_SMOOTH_PREDICTOR(64)
+
+#undef HIGHBD_SMOOTH_PREDICTOR
+
+#define HIGHBD_SMOOTH_NXM_WIDE(W, H) \
+ void aom_highbd_smooth_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+HIGHBD_SMOOTH_NXM_WIDE(16, 4)
+HIGHBD_SMOOTH_NXM_WIDE(16, 8)
+HIGHBD_SMOOTH_NXM_WIDE(16, 16)
+HIGHBD_SMOOTH_NXM_WIDE(16, 32)
+HIGHBD_SMOOTH_NXM_WIDE(16, 64)
+HIGHBD_SMOOTH_NXM_WIDE(32, 8)
+HIGHBD_SMOOTH_NXM_WIDE(32, 16)
+HIGHBD_SMOOTH_NXM_WIDE(32, 32)
+HIGHBD_SMOOTH_NXM_WIDE(32, 64)
+HIGHBD_SMOOTH_NXM_WIDE(64, 16)
+HIGHBD_SMOOTH_NXM_WIDE(64, 32)
+HIGHBD_SMOOTH_NXM_WIDE(64, 64)
+
+#undef HIGHBD_SMOOTH_NXM_WIDE
+
+static void highbd_smooth_v_4xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const top_row,
+ const uint16_t *const left_column,
+ const int height) {
+ const uint16_t bottom_left = left_column[height - 1];
+ const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
+
+ const uint16x4_t top_v = vld1_u16(top_row);
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+
+ for (int y = 0; y < height; ++y) {
+ const uint32x4_t weighted_bl =
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+ const uint32x4_t weighted_top =
+ vmlal_n_u16(weighted_bl, top_v, weights_y[y]);
+ vst1_u16(dst, vrshrn_n_u32(weighted_top, SMOOTH_WEIGHT_LOG2_SCALE));
+
+ dst += stride;
+ }
+}
+
+static void highbd_smooth_v_8xh_neon(uint16_t *dst, const ptrdiff_t stride,
+ const uint16_t *const top_row,
+ const uint16_t *const left_column,
+ const int height) {
+ const uint16_t bottom_left = left_column[height - 1];
+ const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
+
+ const uint16x4_t top_low = vld1_u16(top_row);
+ const uint16x4_t top_high = vld1_u16(top_row + 4);
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+
+ for (int y = 0; y < height; ++y) {
+ const uint32x4_t weighted_bl =
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+
+ const uint32x4_t weighted_top_low =
+ vmlal_n_u16(weighted_bl, top_low, weights_y[y]);
+ vst1_u16(dst, vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE));
+
+ const uint32x4_t weighted_top_high =
+ vmlal_n_u16(weighted_bl, top_high, weights_y[y]);
+ vst1_u16(dst + 4,
+ vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE));
+ dst += stride;
+ }
+}
+
+#define HIGHBD_SMOOTH_V_NXM(W, H) \
+ void aom_highbd_smooth_v_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+HIGHBD_SMOOTH_V_NXM(4, 4)
+HIGHBD_SMOOTH_V_NXM(4, 8)
+HIGHBD_SMOOTH_V_NXM(4, 16)
+HIGHBD_SMOOTH_V_NXM(8, 4)
+HIGHBD_SMOOTH_V_NXM(8, 8)
+HIGHBD_SMOOTH_V_NXM(8, 16)
+HIGHBD_SMOOTH_V_NXM(8, 32)
+
+#undef HIGHBD_SMOOTH_V_NXM
+
+// For width 16 and above.
+#define HIGHBD_SMOOTH_V_PREDICTOR(W) \
+ static void highbd_smooth_v_##W##xh_neon( \
+ uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row, \
+ const uint16_t *const left_column, const int height) { \
+ const uint16_t bottom_left = left_column[height - 1]; \
+ const uint16_t *const weights_y = smooth_weights_u16 + height - 4; \
+ \
+ uint16x4x2_t top_vals[(W) >> 3]; \
+ for (int i = 0; i < (W) >> 3; ++i) { \
+ const int x = i << 3; \
+ top_vals[i].val[0] = vld1_u16(top_row + x); \
+ top_vals[i].val[1] = vld1_u16(top_row + x + 4); \
+ } \
+ \
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); \
+ for (int y = 0; y < height; ++y) { \
+ const uint32x4_t weighted_bl = \
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]); \
+ \
+ uint16_t *dst_x = dst; \
+ for (int i = 0; i < (W) >> 3; ++i) { \
+ const uint32x4_t weighted_top_low = \
+ vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]); \
+ vst1_u16(dst_x, \
+ vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE)); \
+ \
+ const uint32x4_t weighted_top_high = \
+ vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]); \
+ vst1_u16(dst_x + 4, \
+ vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
+ dst_x += 8; \
+ } \
+ dst += stride; \
+ } \
+ }
+
+HIGHBD_SMOOTH_V_PREDICTOR(16)
+HIGHBD_SMOOTH_V_PREDICTOR(32)
+HIGHBD_SMOOTH_V_PREDICTOR(64)
+
+#undef HIGHBD_SMOOTH_V_PREDICTOR
+
+#define HIGHBD_SMOOTH_V_NXM_WIDE(W, H) \
+ void aom_highbd_smooth_v_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 4)
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 8)
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 16)
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 32)
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 64)
+HIGHBD_SMOOTH_V_NXM_WIDE(32, 8)
+HIGHBD_SMOOTH_V_NXM_WIDE(32, 16)
+HIGHBD_SMOOTH_V_NXM_WIDE(32, 32)
+HIGHBD_SMOOTH_V_NXM_WIDE(32, 64)
+HIGHBD_SMOOTH_V_NXM_WIDE(64, 16)
+HIGHBD_SMOOTH_V_NXM_WIDE(64, 32)
+HIGHBD_SMOOTH_V_NXM_WIDE(64, 64)
+
+#undef HIGHBD_SMOOTH_V_NXM_WIDE
+
+static INLINE void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const top_row,
+ const uint16_t *const left_column,
+ const int height) {
+ const uint16_t top_right = top_row[3];
+
+ const uint16x4_t weights_x = vld1_u16(smooth_weights_u16);
+ const uint16x4_t scaled_weights_x = negate_s8(weights_x);
+
+ const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
+ for (int y = 0; y < height; ++y) {
+ const uint32x4_t weighted_left =
+ vmlal_n_u16(weighted_tr, weights_x, left_column[y]);
+ vst1_u16(dst, vrshrn_n_u32(weighted_left, SMOOTH_WEIGHT_LOG2_SCALE));
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const top_row,
+ const uint16_t *const left_column,
+ const int height) {
+ const uint16_t top_right = top_row[7];
+
+ const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
+ vld1_u16(smooth_weights_u16 + 8) } };
+
+ const uint32x4_t weighted_tr_low =
+ vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
+ const uint32x4_t weighted_tr_high =
+ vmull_n_u16(negate_s8(weights_x.val[1]), top_right);
+
+ for (int y = 0; y < height; ++y) {
+ const uint16_t left_y = left_column[y];
+ const uint32x4_t weighted_left_low =
+ vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y);
+ vst1_u16(dst, vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));
+
+ const uint32x4_t weighted_left_high =
+ vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y);
+ vst1_u16(dst + 4,
+ vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE));
+ dst += stride;
+ }
+}
+
+#define HIGHBD_SMOOTH_H_NXM(W, H) \
+ void aom_highbd_smooth_h_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+HIGHBD_SMOOTH_H_NXM(4, 4)
+HIGHBD_SMOOTH_H_NXM(4, 8)
+HIGHBD_SMOOTH_H_NXM(4, 16)
+HIGHBD_SMOOTH_H_NXM(8, 4)
+HIGHBD_SMOOTH_H_NXM(8, 8)
+HIGHBD_SMOOTH_H_NXM(8, 16)
+HIGHBD_SMOOTH_H_NXM(8, 32)
+
+#undef HIGHBD_SMOOTH_H_NXM
+
+// For width 16 and above.
+#define HIGHBD_SMOOTH_H_PREDICTOR(W) \
+  static void highbd_smooth_h_##W##xh_neon(                                   \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \
+ const uint16_t *const left_column, const int height) { \
+ const uint16_t top_right = top_row[(W)-1]; \
+ \
+ uint16x4_t weights_x_low[(W) >> 3]; \
+ uint16x4_t weights_x_high[(W) >> 3]; \
+ uint32x4_t weighted_tr_low[(W) >> 3]; \
+ uint32x4_t weighted_tr_high[(W) >> 3]; \
+ for (int i = 0; i < (W) >> 3; ++i) { \
+ const int x = i << 3; \
+ weights_x_low[i] = vld1_u16(smooth_weights_u16 + (W)-4 + x); \
+ weighted_tr_low[i] = \
+ vmull_n_u16(negate_s8(weights_x_low[i]), top_right); \
+ weights_x_high[i] = vld1_u16(smooth_weights_u16 + (W) + x); \
+ weighted_tr_high[i] = \
+ vmull_n_u16(negate_s8(weights_x_high[i]), top_right); \
+ } \
+ \
+ for (int y = 0; y < height; ++y) { \
+ uint16_t *dst_x = dst; \
+ const uint16_t left_y = left_column[y]; \
+ for (int i = 0; i < (W) >> 3; ++i) { \
+ const uint32x4_t weighted_left_low = \
+ vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y); \
+ vst1_u16(dst_x, \
+ vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE)); \
+ \
+ const uint32x4_t weighted_left_high = \
+ vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y); \
+ vst1_u16(dst_x + 4, \
+ vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
+ dst_x += 8; \
+ } \
+ dst += stride; \
+ } \
+ }
+
+HIGHBD_SMOOTH_H_PREDICTOR(16)
+HIGHBD_SMOOTH_H_PREDICTOR(32)
+HIGHBD_SMOOTH_H_PREDICTOR(64)
+
+#undef HIGHBD_SMOOTH_H_PREDICTOR
+
+#define HIGHBD_SMOOTH_H_NXM_WIDE(W, H) \
+ void aom_highbd_smooth_h_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 4)
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 8)
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 16)
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 32)
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 64)
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 8)
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 16)
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 32)
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 64)
+HIGHBD_SMOOTH_H_NXM_WIDE(64, 16)
+HIGHBD_SMOOTH_H_NXM_WIDE(64, 32)
+HIGHBD_SMOOTH_H_NXM_WIDE(64, 64)
+
+#undef HIGHBD_SMOOTH_H_NXM_WIDE
+
+// -----------------------------------------------------------------------------
+// Z1
+
+static int16_t iota1_s16[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
+static int16_t iota2_s16[] = { 0, 2, 4, 6, 8, 10, 12, 14 };
+
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_z1_apply_shift_x4(uint16x4_t a0,
+ uint16x4_t a1,
+ int shift) {
+ // The C implementation of the z1 predictor uses (32 - shift) and a right
+ // shift by 5, however we instead double shift to avoid an unnecessary right
+ // shift by 1.
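+  // Writing s for the C implementation's shift (so shift == 2 * s here), the
+  // two forms are identical:
+  //   ((32 - s) * a0 + s * a1 + 16) >> 5
+  //     == ((64 - 2s) * a0 + 2s * a1 + 32) >> 6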
+ uint32x4_t res = vmull_n_u16(a1, shift);
+ res = vmlal_n_u16(res, a0, 64 - shift);
+ return vrshrn_n_u32(res, 6);
+}
+
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0,
+ uint16x8_t a1,
+ int shift) {
+ return vcombine_u16(
+ highbd_dr_z1_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1), shift),
+ highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift));
+}
+
+static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst,
+ ptrdiff_t stride, int bw,
+ int bh,
+ const uint16_t *above,
+ int dx) {
+ assert(bw % 4 == 0);
+ assert(bh % 4 == 0);
+ assert(dx > 0);
+
+ const int max_base_x = (bw + bh) - 1;
+ const int above_max = above[max_base_x];
+
+ const int16x8_t iota1x8 = vld1q_s16(iota1_s16);
+ const int16x4_t iota1x4 = vget_low_s16(iota1x8);
+
+ int x = dx;
+ int r = 0;
+ do {
+ const int base = x >> 6;
+ if (base >= max_base_x) {
+ for (int i = r; i < bh; ++i) {
+ aom_memset16(dst, above_max, bw);
+ dst += stride;
+ }
+ return;
+ }
+
+ // The C implementation of the z1 predictor when not upsampling uses:
+ // ((x & 0x3f) >> 1)
+ // The right shift is unnecessary here since we instead shift by +1 later,
+ // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
+ const int shift = x & 0x3e;
+
+ if (bw == 4) {
+ const uint16x4_t a0 = vld1_u16(&above[base]);
+ const uint16x4_t a1 = vld1_u16(&above[base + 1]);
+ const uint16x4_t val = highbd_dr_z1_apply_shift_x4(a0, a1, shift);
+ const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota1x4);
+ const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max));
+ vst1_u16(dst, res);
+ } else {
+ int c = 0;
+ do {
+ const uint16x8_t a0 = vld1q_u16(&above[base + c]);
+ const uint16x8_t a1 = vld1q_u16(&above[base + c + 1]);
+ const uint16x8_t val = highbd_dr_z1_apply_shift_x8(a0, a1, shift);
+ const uint16x8_t cmp =
+ vcgtq_s16(vdupq_n_s16(max_base_x - base - c), iota1x8);
+ const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max));
+ vst1q_u16(dst + c, res);
+ c += 8;
+ } while (c < bw);
+ }
+
+ dst += stride;
+ x += dx;
+ } while (++r < bh);
+}
+
+static void highbd_dr_prediction_z1_upsample1_neon(uint16_t *dst,
+ ptrdiff_t stride, int bw,
+ int bh,
+ const uint16_t *above,
+ int dx) {
+ assert(bw % 4 == 0);
+ assert(bh % 4 == 0);
+ assert(dx > 0);
+
+ const int max_base_x = ((bw + bh) - 1) << 1;
+ const int above_max = above[max_base_x];
+
+ const int16x8_t iota2x8 = vld1q_s16(iota2_s16);
+ const int16x4_t iota2x4 = vget_low_s16(iota2x8);
+
+ int x = dx;
+ int r = 0;
+ do {
+ const int base = x >> 5;
+ if (base >= max_base_x) {
+ for (int i = r; i < bh; ++i) {
+ aom_memset16(dst, above_max, bw);
+ dst += stride;
+ }
+ return;
+ }
+
+ // The C implementation of the z1 predictor when upsampling uses:
+ // (((x << 1) & 0x3f) >> 1)
+ // The right shift is unnecessary here since we instead shift by +1 later,
+ // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
+ const int shift = (x << 1) & 0x3e;
+
+ if (bw == 4) {
+ const uint16x4x2_t a01 = vld2_u16(&above[base]);
+ const uint16x4_t val =
+ highbd_dr_z1_apply_shift_x4(a01.val[0], a01.val[1], shift);
+ const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota2x4);
+ const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max));
+ vst1_u16(dst, res);
+ } else {
+ int c = 0;
+ do {
+ const uint16x8x2_t a01 = vld2q_u16(&above[base + 2 * c]);
+ const uint16x8_t val =
+ highbd_dr_z1_apply_shift_x8(a01.val[0], a01.val[1], shift);
+ const uint16x8_t cmp =
+ vcgtq_s16(vdupq_n_s16(max_base_x - base - 2 * c), iota2x8);
+ const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max));
+ vst1q_u16(dst + c, res);
+ c += 8;
+ } while (c < bw);
+ }
+
+ dst += stride;
+ x += dx;
+ } while (++r < bh);
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_highbd_dr_prediction_z1_neon(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int upsample_above,
+ int dx, int dy, int bd) {
+ (void)left;
+ (void)dy;
+ (void)bd;
+ assert(dy == 1);
+
+ if (upsample_above) {
+ highbd_dr_prediction_z1_upsample1_neon(dst, stride, bw, bh, above, dx);
+ } else {
+ highbd_dr_prediction_z1_upsample0_neon(dst, stride, bw, bh, above, dx);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Z2
+
+#if AOM_ARCH_AARCH64
+// Incrementally shift more elements from `above` into the result, merging with
+// existing `left` elements.
+// X0, X1, X2, X3
+// Y0, X0, X1, X2
+// Y0, Y1, X0, X1
+// Y0, Y1, Y2, X0
+// Y0, Y1, Y2, Y3
+// clang-format off
+static const uint8_t z2_merge_shuffles_u16x4[5][8] = {
+ { 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 0, 1, 8, 9, 10, 11, 12, 13 },
+ { 0, 1, 2, 3, 8, 9, 10, 11 },
+ { 0, 1, 2, 3, 4, 5, 8, 9 },
+ { 0, 1, 2, 3, 4, 5, 6, 7 },
+};
+// clang-format on
+
+// Incrementally shift more elements from `above` into the result, merging with
+// existing `left` elements.
+// X0, X1, X2, X3, X4, X5, X6, X7
+// Y0, X0, X1, X2, X3, X4, X5, X6
+// Y0, Y1, X0, X1, X2, X3, X4, X5
+// Y0, Y1, Y2, X0, X1, X2, X3, X4
+// Y0, Y1, Y2, Y3, X0, X1, X2, X3
+// Y0, Y1, Y2, Y3, Y4, X0, X1, X2
+// Y0, Y1, Y2, Y3, Y4, Y5, X0, X1
+// Y0, Y1, Y2, Y3, Y4, Y5, Y6, X0
+// Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7
+// clang-format off
+static const uint8_t z2_merge_shuffles_u16x8[9][16] = {
+ { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
+ { 0, 1, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
+ { 0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 },
+ { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 18, 19, 20, 21 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+};
+// clang-format on
+
+// clang-format off
+static const uint16_t z2_y_iter_masks_u16x4[5][4] = {
+ { 0U, 0U, 0U, 0U },
+ { 0xffffU, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU },
+};
+// clang-format on
+
+// clang-format off
+static const uint16_t z2_y_iter_masks_u16x8[9][8] = {
+ { 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U },
+ { 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU },
+};
+// clang-format on
+
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x8(
+ const uint16x8_t left_data, const int16x4_t indices, int base, int n) {
+ // Need to adjust indices to operate on 0-based indices rather than
+ // `base`-based indices and then adjust from uint16x4 indices to uint8x8
+ // indices so we can use a tbl instruction (which only operates on bytes).
+ uint8x8_t left_indices =
+ vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
+ left_indices = vtrn1_u8(left_indices, left_indices);
+ left_indices = vadd_u8(left_indices, left_indices);
+ left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100)));
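+  // Each 16-bit element index i now maps to the byte pair { 2*i, 2*i + 1 },
+  // e.g. element 3 selects bytes { 6, 7 } of left_data.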
+ const uint16x4_t ret = vreinterpret_u16_u8(
+ vqtbl1_u8(vreinterpretq_u8_u16(left_data), left_indices));
+ return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n]));
+}
+
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x16(
+ const uint16x8x2_t left_data, const int16x4_t indices, int base, int n) {
+ // Need to adjust indices to operate on 0-based indices rather than
+ // `base`-based indices and then adjust from uint16x4 indices to uint8x8
+ // indices so we can use a tbl instruction (which only operates on bytes).
+ uint8x8_t left_indices =
+ vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
+ left_indices = vtrn1_u8(left_indices, left_indices);
+ left_indices = vadd_u8(left_indices, left_indices);
+ left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100)));
+ uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]),
+ vreinterpretq_u8_u16(left_data.val[1]) } };
+ const uint16x4_t ret = vreinterpret_u16_u8(vqtbl2_u8(data_u8, left_indices));
+ return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n]));
+}
+
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+ const uint16x8_t left_data, const int16x8_t indices, int base, int n) {
+ // Need to adjust indices to operate on 0-based indices rather than
+  // `base`-based indices and then adjust from uint16x8 indices to uint8x16
+ // indices so we can use a tbl instruction (which only operates on bytes).
+ uint8x16_t left_indices =
+ vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
+ left_indices = vtrn1q_u8(left_indices, left_indices);
+ left_indices = vaddq_u8(left_indices, left_indices);
+ left_indices =
+ vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100)));
+ const uint16x8_t ret = vreinterpretq_u16_u8(
+ vqtbl1q_u8(vreinterpretq_u8_u16(left_data), left_indices));
+ return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n]));
+}
+
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+ const uint16x8x2_t left_data, const int16x8_t indices, int base, int n) {
+ // Need to adjust indices to operate on 0-based indices rather than
+  // `base`-based indices and then adjust from uint16x8 indices to uint8x16
+ // indices so we can use a tbl instruction (which only operates on bytes).
+ uint8x16_t left_indices =
+ vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
+ left_indices = vtrn1q_u8(left_indices, left_indices);
+ left_indices = vaddq_u8(left_indices, left_indices);
+ left_indices =
+ vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100)));
+ uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]),
+ vreinterpretq_u8_u16(left_data.val[1]) } };
+ const uint16x8_t ret =
+ vreinterpretq_u16_u8(vqtbl2q_u8(data_u8, left_indices));
+ return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n]));
+}
+#endif // AOM_ARCH_AARCH64
+
+static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4(
+ const uint16_t *left, const int16x4_t indices, int n) {
+ assert(n > 0);
+ assert(n <= 4);
+ // Load two elements at a time and then uzp them into separate vectors, to
+ // reduce the number of memory accesses.
+ uint32x2_t ret0_u32 = vdup_n_u32(0);
+ uint32x2_t ret1_u32 = vdup_n_u32(0);
+
+ // Use a single vget_lane_u64 to minimize vector to general purpose register
+ // transfers and then mask off the bits we actually want.
+ const uint64_t indices0123 = vget_lane_u64(vreinterpret_u64_s16(indices), 0);
+ const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
+ const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
+ const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
+ const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);
+
+ // At time of writing both Clang and GCC produced better code with these
+ // nested if-statements compared to a switch statement with fallthrough.
+ ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0);
+ if (n > 1) {
+ ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1);
+ if (n > 2) {
+ ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx2), ret1_u32, 0);
+ if (n > 3) {
+ ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx3), ret1_u32, 1);
+ }
+ }
+ }
+ return vuzp_u16(vreinterpret_u16_u32(ret0_u32),
+ vreinterpret_u16_u32(ret1_u32));
+}
+
+static AOM_FORCE_INLINE uint16x8x2_t highbd_dr_prediction_z2_gather_left_x8(
+ const uint16_t *left, const int16x8_t indices, int n) {
+ assert(n > 0);
+ assert(n <= 8);
+ // Load two elements at a time and then uzp them into separate vectors, to
+ // reduce the number of memory accesses.
+ uint32x4_t ret0_u32 = vdupq_n_u32(0);
+ uint32x4_t ret1_u32 = vdupq_n_u32(0);
+
+ // Use a pair of vget_lane_u64 to minimize vector to general purpose register
+ // transfers and then mask off the bits we actually want.
+ const uint64_t indices0123 =
+ vgetq_lane_u64(vreinterpretq_u64_s16(indices), 0);
+ const uint64_t indices4567 =
+ vgetq_lane_u64(vreinterpretq_u64_s16(indices), 1);
+ const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
+ const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
+ const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
+ const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);
+ const int idx4 = (int16_t)((indices4567 >> 0) & 0xffffU);
+ const int idx5 = (int16_t)((indices4567 >> 16) & 0xffffU);
+ const int idx6 = (int16_t)((indices4567 >> 32) & 0xffffU);
+ const int idx7 = (int16_t)((indices4567 >> 48) & 0xffffU);
+
+ // At time of writing both Clang and GCC produced better code with these
+ // nested if-statements compared to a switch statement with fallthrough.
+ ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0);
+ if (n > 1) {
+ ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1);
+ if (n > 2) {
+ ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx2), ret0_u32, 2);
+ if (n > 3) {
+ ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx3), ret0_u32, 3);
+ if (n > 4) {
+ ret1_u32 =
+ vld1q_lane_u32((const uint32_t *)(left + idx4), ret1_u32, 0);
+ if (n > 5) {
+ ret1_u32 =
+ vld1q_lane_u32((const uint32_t *)(left + idx5), ret1_u32, 1);
+ if (n > 6) {
+ ret1_u32 =
+ vld1q_lane_u32((const uint32_t *)(left + idx6), ret1_u32, 2);
+ if (n > 7) {
+ ret1_u32 = vld1q_lane_u32((const uint32_t *)(left + idx7),
+ ret1_u32, 3);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ return vuzpq_u16(vreinterpretq_u16_u32(ret0_u32),
+ vreinterpretq_u16_u32(ret1_u32));
+}
+
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_merge_x4(
+ uint16x4_t out_x, uint16x4_t out_y, int base_shift) {
+ assert(base_shift >= 0);
+ assert(base_shift <= 4);
+ // On AArch64 we can permute the data from the `above` and `left` vectors
+ // into a single vector in a single load (of the permute vector) + tbl.
+#if AOM_ARCH_AARCH64
+ const uint8x8x2_t out_yx = { { vreinterpret_u8_u16(out_y),
+ vreinterpret_u8_u16(out_x) } };
+ return vreinterpret_u16_u8(
+ vtbl2_u8(out_yx, vld1_u8(z2_merge_shuffles_u16x4[base_shift])));
+#else
+ uint16x4_t out = out_y;
+ for (int c2 = base_shift, x_idx = 0; c2 < 4; ++c2, ++x_idx) {
+ out[c2] = out_x[x_idx];
+ }
+ return out;
+#endif
+}
+
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_merge_x8(
+ uint16x8_t out_x, uint16x8_t out_y, int base_shift) {
+ assert(base_shift >= 0);
+ assert(base_shift <= 8);
+ // On AArch64 we can permute the data from the `above` and `left` vectors
+ // into a single vector in a single load (of the permute vector) + tbl.
+#if AOM_ARCH_AARCH64
+ const uint8x16x2_t out_yx = { { vreinterpretq_u8_u16(out_y),
+ vreinterpretq_u8_u16(out_x) } };
+ return vreinterpretq_u16_u8(
+ vqtbl2q_u8(out_yx, vld1q_u8(z2_merge_shuffles_u16x8[base_shift])));
+#else
+ uint16x8_t out = out_y;
+ for (int c2 = base_shift, x_idx = 0; c2 < 8; ++c2, ++x_idx) {
+ out[c2] = out_x[x_idx];
+ }
+ return out;
+#endif
+}
+
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_apply_shift_x4(
+ uint16x4_t a0, uint16x4_t a1, int16x4_t shift) {
+ uint32x4_t res = vmull_u16(a1, vreinterpret_u16_s16(shift));
+ res =
+ vmlal_u16(res, a0, vsub_u16(vdup_n_u16(32), vreinterpret_u16_s16(shift)));
+ return vrshrn_n_u32(res, 5);
+}
+
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_apply_shift_x8(
+ uint16x8_t a0, uint16x8_t a1, int16x8_t shift) {
+ return vcombine_u16(
+ highbd_dr_prediction_z2_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1),
+ vget_low_s16(shift)),
+ highbd_dr_prediction_z2_apply_shift_x4(
+ vget_high_u16(a0), vget_high_u16(a1), vget_high_s16(shift)));
+}
+
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_step_x4(
+ const uint16_t *above, const uint16x4_t above0, const uint16x4_t above1,
+ const uint16_t *left, int dx, int dy, int r, int c) {
+ const int16x4_t iota = vld1_s16(iota1_s16);
+
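+  // Q6 fixed-point positions of the first lane along the `above` (x) and
+  // `left` (y) edges for this (r, c) block.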
+ const int x0 = (c << 6) - (r + 1) * dx;
+ const int y0 = (r << 6) - (c + 1) * dy;
+
+ const int16x4_t x0123 = vadd_s16(vdup_n_s16(x0), vshl_n_s16(iota, 6));
+ const int16x4_t y0123 = vsub_s16(vdup_n_s16(y0), vmul_n_s16(iota, dy));
+ const int16x4_t shift_x0123 =
+ vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1);
+ const int16x4_t shift_y0123 =
+ vshr_n_s16(vand_s16(y0123, vdup_n_s16(0x3F)), 1);
+ const int16x4_t base_y0123 = vshr_n_s16(y0123, 6);
+
+ const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c;
+
+ // Based on the value of `base_shift` there are three possible cases to
+ // compute the result:
+ // 1) base_shift <= 0: We can load and operate entirely on data from the
+ // `above` input vector.
+ // 2) base_shift < vl: We can load from `above[-1]` and shift
+ // `vl - base_shift` elements across to the end of the
+ // vector, then compute the remainder from `left`.
+ // 3) base_shift >= vl: We can load and operate entirely on data from the
+ // `left` input vector.
+
+ if (base_shift <= 0) {
+ const int base_x = x0 >> 6;
+ const uint16x4_t a0 = vld1_u16(above + base_x);
+ const uint16x4_t a1 = vld1_u16(above + base_x + 1);
+ return highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
+ } else if (base_shift < 4) {
+ const uint16x4x2_t l01 = highbd_dr_prediction_z2_gather_left_x4(
+ left + 1, base_y0123, base_shift);
+ const uint16x4_t out16_y = highbd_dr_prediction_z2_apply_shift_x4(
+ l01.val[0], l01.val[1], shift_y0123);
+
+ // No need to reload from above in the loop, just use pre-loaded constants.
+ const uint16x4_t out16_x =
+ highbd_dr_prediction_z2_apply_shift_x4(above0, above1, shift_x0123);
+
+ return highbd_dr_prediction_z2_merge_x4(out16_x, out16_y, base_shift);
+ } else {
+ const uint16x4x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x4(left + 1, base_y0123, 4);
+ return highbd_dr_prediction_z2_apply_shift_x4(l01.val[0], l01.val[1],
+ shift_y0123);
+ }
+}
+
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_step_x8(
+ const uint16_t *above, const uint16x8_t above0, const uint16x8_t above1,
+ const uint16_t *left, int dx, int dy, int r, int c) {
+ const int16x8_t iota = vld1q_s16(iota1_s16);
+
+ const int x0 = (c << 6) - (r + 1) * dx;
+ const int y0 = (r << 6) - (c + 1) * dy;
+
+ const int16x8_t x01234567 = vaddq_s16(vdupq_n_s16(x0), vshlq_n_s16(iota, 6));
+ const int16x8_t y01234567 = vsubq_s16(vdupq_n_s16(y0), vmulq_n_s16(iota, dy));
+ const int16x8_t shift_x01234567 =
+ vshrq_n_s16(vandq_s16(x01234567, vdupq_n_s16(0x3F)), 1);
+ const int16x8_t shift_y01234567 =
+ vshrq_n_s16(vandq_s16(y01234567, vdupq_n_s16(0x3F)), 1);
+ const int16x8_t base_y01234567 = vshrq_n_s16(y01234567, 6);
+
+ const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c;
+
+ // Based on the value of `base_shift` there are three possible cases to
+ // compute the result:
+ // 1) base_shift <= 0: We can load and operate entirely on data from the
+ // `above` input vector.
+ // 2) base_shift < vl: We can load from `above[-1]` and shift
+ // `vl - base_shift` elements across to the end of the
+ // vector, then compute the remainder from `left`.
+ // 3) base_shift >= vl: We can load and operate entirely on data from the
+ // `left` input vector.
+
+ if (base_shift <= 0) {
+ const int base_x = x0 >> 6;
+ const uint16x8_t a0 = vld1q_u16(above + base_x);
+ const uint16x8_t a1 = vld1q_u16(above + base_x + 1);
+ return highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
+ } else if (base_shift < 8) {
+ const uint16x8x2_t l01 = highbd_dr_prediction_z2_gather_left_x8(
+ left + 1, base_y01234567, base_shift);
+ const uint16x8_t out16_y = highbd_dr_prediction_z2_apply_shift_x8(
+ l01.val[0], l01.val[1], shift_y01234567);
+
+ // No need to reload from above in the loop, just use pre-loaded constants.
+ const uint16x8_t out16_x =
+ highbd_dr_prediction_z2_apply_shift_x8(above0, above1, shift_x01234567);
+
+ return highbd_dr_prediction_z2_merge_x8(out16_x, out16_y, base_shift);
+ } else {
+ const uint16x8x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x8(left + 1, base_y01234567, 8);
+ return highbd_dr_prediction_z2_apply_shift_x8(l01.val[0], l01.val[1],
+ shift_y01234567);
+ }
+}
+
+// Left array is accessed from -1 through `bh - 1` inclusive.
+// Above array is accessed from -1 through `bw - 1` inclusive.
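+// The samples left[-1 .. bh - 1] are copied into a local buffer up front so
+// that the row loop reads the left edge from a contiguous, 0-based array.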
+#define HIGHBD_DR_PREDICTOR_Z2_WXH(bw, bh) \
+ static void highbd_dr_prediction_z2_##bw##x##bh##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int upsample_above, int upsample_left, int dx, \
+ int dy, int bd) { \
+ (void)bd; \
+ (void)upsample_above; \
+ (void)upsample_left; \
+ assert(!upsample_above); \
+ assert(!upsample_left); \
+ assert(bw % 4 == 0); \
+ assert(bh % 4 == 0); \
+ assert(dx > 0); \
+ assert(dy > 0); \
+ \
+ uint16_t left_data[bh + 1]; \
+ memcpy(left_data, left - 1, (bh + 1) * sizeof(uint16_t)); \
+ \
+ uint16x8_t a0, a1; \
+ if (bw == 4) { \
+ a0 = vcombine_u16(vld1_u16(above - 1), vdup_n_u16(0)); \
+ a1 = vcombine_u16(vld1_u16(above + 0), vdup_n_u16(0)); \
+ } else { \
+ a0 = vld1q_u16(above - 1); \
+ a1 = vld1q_u16(above + 0); \
+ } \
+ \
+ int r = 0; \
+ do { \
+ if (bw == 4) { \
+ vst1_u16(dst, highbd_dr_prediction_z2_step_x4( \
+ above, vget_low_u16(a0), vget_low_u16(a1), \
+ left_data, dx, dy, r, 0)); \
+ } else { \
+ int c = 0; \
+ do { \
+ vst1q_u16(dst + c, highbd_dr_prediction_z2_step_x8( \
+ above, a0, a1, left_data, dx, dy, r, c)); \
+ c += 8; \
+ } while (c < bw); \
+ } \
+ dst += stride; \
+ } while (++r < bh); \
+ }
+
+HIGHBD_DR_PREDICTOR_Z2_WXH(4, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(8, 32)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 4)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 64)
+HIGHBD_DR_PREDICTOR_Z2_WXH(32, 8)
+HIGHBD_DR_PREDICTOR_Z2_WXH(32, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32)
+HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64)
+HIGHBD_DR_PREDICTOR_Z2_WXH(64, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32)
+HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64)
+
+#undef HIGHBD_DR_PREDICTOR_Z2_WXH
+
+typedef void (*highbd_dr_prediction_z2_ptr)(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left,
+ int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd);
+
+static void highbd_dr_prediction_z2_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left,
+ int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {
+ (void)bd;
+ assert(dx > 0);
+ assert(dy > 0);
+
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+ const int min_base_x = -(1 << (upsample_above + frac_bits_x));
+
+ // if `upsample_left` then we need -2 through 6 inclusive from `left`.
+ // else we only need -1 through 3 inclusive.
+
+#if AOM_ARCH_AARCH64
+ uint16x8_t left_data0, left_data1;
+ if (upsample_left) {
+ left_data0 = vld1q_u16(left - 2);
+ left_data1 = vld1q_u16(left - 1);
+ } else {
+ left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0));
+ left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0));
+ }
+#endif
+
+ const int16x4_t iota0123 = vld1_s16(iota1_s16);
+ const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1);
+
+ for (int r = 0; r < 4; ++r) {
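+    // base_shift is the number of lanes at the start of this row that fall
+    // outside `above` and must instead be predicted from `left`.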
+ const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
+ const int x0 = (r + 1) * dx;
+ const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0));
+ const int base_x0 = (-x0) >> frac_bits_x;
+ if (base_shift <= 0) {
+ uint16x4_t a0, a1;
+ int16x4_t shift_x0123;
+ if (upsample_above) {
+ const uint16x4x2_t a01 = vld2_u16(above + base_x0);
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
+ } else {
+ a0 = vld1_u16(above + base_x0);
+ a1 = vld1_u16(above + base_x0 + 1);
+ shift_x0123 = vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1);
+ }
+ vst1_u16(dst,
+ highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123));
+ } else if (base_shift < 4) {
+ // Calculate Y component from `left`.
+ const int y_iters = base_shift;
+ const int16x4_t y0123 =
+ vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
+ const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
+ const int16x4_t shift_y0123 = vshr_n_s16(
+ vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
+ uint16x4_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123,
+ left_data_base, y_iters);
+ l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123,
+ left_data_base, y_iters);
+#else
+ const uint16x4x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ const uint16x4_t out_y =
+ highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123);
+
+ // Calculate X component from `above`.
+ const int16x4_t shift_x0123 = vshr_n_s16(
+ vand_s16(vmul_n_s16(x0123, 1 << upsample_above), vdup_n_s16(0x3F)),
+ 1);
+ uint16x4_t a0, a1;
+ if (upsample_above) {
+ const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ } else {
+ a0 = vld1_u16(above - 1);
+ a1 = vld1_u16(above + 0);
+ }
+ const uint16x4_t out_x =
+ highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
+
+ // Combine X and Y vectors.
+ const uint16x4_t out =
+ highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift);
+ vst1_u16(dst, out);
+ } else {
+ const int16x4_t y0123 =
+ vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
+ const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
+ const int16x4_t shift_y0123 = vshr_n_s16(
+ vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
+ uint16x4_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123,
+ left_data_base, 4);
+ l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123,
+ left_data_base, 4);
+#else
+ const uint16x4x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+ vst1_u16(dst,
+ highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123));
+ }
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_z2_4x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left,
+ int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {
+ (void)bd;
+ assert(dx > 0);
+ assert(dy > 0);
+
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+ const int min_base_x = -(1 << (upsample_above + frac_bits_x));
+
+ // if `upsample_left` then we need -2 through 14 inclusive from `left`.
+ // else we only need -1 through 6 inclusive.
+
+#if AOM_ARCH_AARCH64
+ uint16x8x2_t left_data0, left_data1;
+ if (upsample_left) {
+ left_data0 = vld1q_u16_x2(left - 2);
+ left_data1 = vld1q_u16_x2(left - 1);
+ } else {
+ left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } };
+ left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } };
+ }
+#endif
+
+ const int16x4_t iota0123 = vld1_s16(iota1_s16);
+ const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1);
+
+ for (int r = 0; r < 8; ++r) {
+ const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
+ const int x0 = (r + 1) * dx;
+ const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0));
+ const int base_x0 = (-x0) >> frac_bits_x;
+ if (base_shift <= 0) {
+ uint16x4_t a0, a1;
+ int16x4_t shift_x0123;
+ if (upsample_above) {
+ const uint16x4x2_t a01 = vld2_u16(above + base_x0);
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
+ } else {
+ a0 = vld1_u16(above + base_x0);
+ a1 = vld1_u16(above + base_x0 + 1);
+ shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F));
+ }
+ vst1_u16(dst,
+ highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123));
+ } else if (base_shift < 4) {
+ // Calculate Y component from `left`.
+ const int y_iters = base_shift;
+ const int16x4_t y0123 =
+ vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
+ const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
+ const int16x4_t shift_y0123 = vshr_n_s16(
+ vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
+
+ uint16x4_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(
+ left_data0, base_y0123, left_data_base, y_iters);
+ l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(
+ left_data1, base_y0123, left_data_base, y_iters);
+#else
+ const uint16x4x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ const uint16x4_t out_y =
+ highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123);
+
+ // Calculate X component from `above`.
+ uint16x4_t a0, a1;
+ int16x4_t shift_x0123;
+ if (upsample_above) {
+ const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
+ } else {
+ a0 = vld1_u16(above - 1);
+ a1 = vld1_u16(above + 0);
+ shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F));
+ }
+ const uint16x4_t out_x =
+ highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
+
+ // Combine X and Y vectors.
+ const uint16x4_t out =
+ highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift);
+ vst1_u16(dst, out);
+ } else {
+ const int16x4_t y0123 =
+ vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
+ const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
+ const int16x4_t shift_y0123 = vshr_n_s16(
+ vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
+
+ uint16x4_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data0, base_y0123,
+ left_data_base, 4);
+ l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data1, base_y0123,
+ left_data_base, 4);
+#else
+ const uint16x4x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ vst1_u16(dst,
+ highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123));
+ }
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_z2_8x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left,
+ int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {
+ (void)bd;
+ assert(dx > 0);
+ assert(dy > 0);
+
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+ const int min_base_x = -(1 << (upsample_above + frac_bits_x));
+
+ // if `upsample_left` then we need -2 through 6 inclusive from `left`.
+ // else we only need -1 through 3 inclusive.
+
+#if AOM_ARCH_AARCH64
+ uint16x8_t left_data0, left_data1;
+ if (upsample_left) {
+ left_data0 = vld1q_u16(left - 2);
+ left_data1 = vld1q_u16(left - 1);
+ } else {
+ left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0));
+ left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0));
+ }
+#endif
+
+ const int16x8_t iota01234567 = vld1q_s16(iota1_s16);
+ const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1);
+
+ for (int r = 0; r < 4; ++r) {
+ const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
+ const int x0 = (r + 1) * dx;
+ const int16x8_t x01234567 =
+ vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0));
+ const int base_x0 = (-x0) >> frac_bits_x;
+ if (base_shift <= 0) {
+ uint16x8_t a0, a1;
+ int16x8_t shift_x01234567;
+ if (upsample_above) {
+ const uint16x8x2_t a01 = vld2q_u16(above + base_x0);
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
+ } else {
+ a0 = vld1q_u16(above + base_x0);
+ a1 = vld1q_u16(above + base_x0 + 1);
+ shift_x01234567 =
+ vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
+ }
+ vst1q_u16(
+ dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567));
+ } else if (base_shift < 8) {
+ // Calculate Y component from `left`.
+ const int y_iters = base_shift;
+ const int16x8_t y01234567 =
+ vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
+ const int16x8_t base_y01234567 =
+ vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
+ const int16x8_t shift_y01234567 =
+ vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
+ vdupq_n_s16(0x3F)),
+ 1);
+
+ uint16x8_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+ left_data0, base_y01234567, left_data_base, y_iters);
+ l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+ left_data1, base_y01234567, left_data_base, y_iters);
+#else
+ const uint16x8x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ const uint16x8_t out_y =
+ highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567);
+
+ // Calculate X component from `above`.
+ uint16x8_t a0, a1;
+ int16x8_t shift_x01234567;
+ if (upsample_above) {
+ const uint16x8x2_t a01 =
+ vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
+ } else {
+ a0 = vld1q_u16(above - 1);
+ a1 = vld1q_u16(above + 0);
+ shift_x01234567 =
+ vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
+ }
+ const uint16x8_t out_x =
+ highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
+
+ // Combine X and Y vectors.
+ const uint16x8_t out =
+ highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift);
+ vst1q_u16(dst, out);
+ } else {
+ const int16x8_t y01234567 =
+ vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
+ const int16x8_t base_y01234567 =
+ vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
+ const int16x8_t shift_y01234567 =
+ vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
+ vdupq_n_s16(0x3F)),
+ 1);
+
+ uint16x8_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+ left_data0, base_y01234567, left_data_base, 8);
+ l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+ left_data1, base_y01234567, left_data_base, 8);
+#else
+ const uint16x8x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ vst1q_u16(
+ dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
+ }
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_z2_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left,
+ int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {
+ (void)bd;
+ assert(dx > 0);
+ assert(dy > 0);
+
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+ const int min_base_x = -(1 << (upsample_above + frac_bits_x));
+
+ // if `upsample_left` then we need -2 through 14 inclusive from `left`.
+ // else we only need -1 through 6 inclusive.
+
+#if AOM_ARCH_AARCH64
+ uint16x8x2_t left_data0, left_data1;
+ if (upsample_left) {
+ left_data0 = vld1q_u16_x2(left - 2);
+ left_data1 = vld1q_u16_x2(left - 1);
+ } else {
+ left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } };
+ left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } };
+ }
+#endif
+
+ const int16x8_t iota01234567 = vld1q_s16(iota1_s16);
+ const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1);
+
+ for (int r = 0; r < 8; ++r) {
+ const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
+ const int x0 = (r + 1) * dx;
+ const int16x8_t x01234567 =
+ vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0));
+ const int base_x0 = (-x0) >> frac_bits_x;
+ if (base_shift <= 0) {
+ uint16x8_t a0, a1;
+ int16x8_t shift_x01234567;
+ if (upsample_above) {
+ const uint16x8x2_t a01 = vld2q_u16(above + base_x0);
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
+ } else {
+ a0 = vld1q_u16(above + base_x0);
+ a1 = vld1q_u16(above + base_x0 + 1);
+ shift_x01234567 =
+ vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
+ }
+ vst1q_u16(
+ dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567));
+ } else if (base_shift < 8) {
+ // Calculate Y component from `left`.
+ const int y_iters = base_shift;
+ const int16x8_t y01234567 =
+ vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
+ const int16x8_t base_y01234567 =
+ vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
+ const int16x8_t shift_y01234567 =
+ vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
+ vdupq_n_s16(0x3F)),
+ 1);
+
+ uint16x8_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+ left_data0, base_y01234567, left_data_base, y_iters);
+ l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+ left_data1, base_y01234567, left_data_base, y_iters);
+#else
+ const uint16x8x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ const uint16x8_t out_y =
+ highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567);
+
+ // Calculate X component from `above`.
+ uint16x8_t a0, a1;
+ int16x8_t shift_x01234567;
+ if (upsample_above) {
+ const uint16x8x2_t a01 =
+ vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
+ } else {
+ a0 = vld1q_u16(above - 1);
+ a1 = vld1q_u16(above + 0);
+ shift_x01234567 =
+ vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
+ }
+ const uint16x8_t out_x =
+ highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
+
+ // Combine X and Y vectors.
+ const uint16x8_t out =
+ highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift);
+ vst1q_u16(dst, out);
+ } else {
+ const int16x8_t y01234567 =
+ vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
+ const int16x8_t base_y01234567 =
+ vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
+ const int16x8_t shift_y01234567 =
+ vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
+ vdupq_n_s16(0x3F)),
+ 1);
+
+ uint16x8_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+ left_data0, base_y01234567, left_data_base, 8);
+ l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+ left_data1, base_y01234567, left_data_base, 8);
+#else
+ const uint16x8x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ vst1q_u16(
+ dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
+ }
+ dst += stride;
+ }
+}
+
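+// Specialisations indexed by [get_msb(bw)][get_msb(bh)], i.e. log2 of the
+// block dimensions; entries that are never dispatched to are NULL.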
+static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = {
+ { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+ { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon,
+ &highbd_dr_prediction_z2_4x8_neon, &highbd_dr_prediction_z2_4x16_neon, NULL,
+ NULL },
+ { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon,
+ &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon,
+ &highbd_dr_prediction_z2_8x32_neon, NULL },
+ { NULL, NULL, &highbd_dr_prediction_z2_16x4_neon,
+ &highbd_dr_prediction_z2_16x8_neon, &highbd_dr_prediction_z2_16x16_neon,
+ &highbd_dr_prediction_z2_16x32_neon, &highbd_dr_prediction_z2_16x64_neon },
+ { NULL, NULL, NULL, &highbd_dr_prediction_z2_32x8_neon,
+ &highbd_dr_prediction_z2_32x16_neon, &highbd_dr_prediction_z2_32x32_neon,
+ &highbd_dr_prediction_z2_32x64_neon },
+ { NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x16_neon,
+ &highbd_dr_prediction_z2_64x32_neon, &highbd_dr_prediction_z2_64x64_neon },
+};
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {
+ highbd_dr_prediction_z2_ptr f =
+ dr_predictor_z2_arr_neon[get_msb(bw)][get_msb(bh)];
+ assert(f != NULL);
+ f(dst, stride, above, left, upsample_above, upsample_left, dx, dy, bd);
+}
+
+// -----------------------------------------------------------------------------
+// Z3
+
+// Both the lane to use and the shift amount must be immediates.
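+// Computes (in0 * s0[lane] + in1 * s1[lane]) with a rounding right shift by
+// `shift`, replacing lanes whose source index (base + iota) has reached
+// max_base_y with the replicated last left sample (left_max).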
+#define HIGHBD_DR_PREDICTOR_Z3_STEP_X4(out, iota, base, in0, in1, s0, s1, \
+ lane, shift) \
+ do { \
+ uint32x4_t val = vmull_lane_u16((in0), (s0), (lane)); \
+ val = vmlal_lane_u16(val, (in1), (s1), (lane)); \
+ const uint16x4_t cmp = vadd_u16((iota), vdup_n_u16(base)); \
+ const uint16x4_t res = vrshrn_n_u32(val, (shift)); \
+ *(out) = vbsl_u16(vclt_u16(cmp, vdup_n_u16(max_base_y)), res, \
+ vdup_n_u16(left_max)); \
+ } while (0)
+
+#define HIGHBD_DR_PREDICTOR_Z3_STEP_X8(out, iota, base, in0, in1, s0, s1, \
+ lane, shift) \
+ do { \
+ uint32x4_t val_lo = vmull_lane_u16(vget_low_u16(in0), (s0), (lane)); \
+ val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane)); \
+ uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane)); \
+ val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane)); \
+ const uint16x8_t cmp = vaddq_u16((iota), vdupq_n_u16(base)); \
+ const uint16x8_t res = vcombine_u16(vrshrn_n_u32(val_lo, (shift)), \
+ vrshrn_n_u32(val_hi, (shift))); \
+ *(out) = vbslq_u16(vcltq_u16(cmp, vdupq_n_u16(max_base_y)), res, \
+ vdupq_n_u16(left_max)); \
+ } while (0)
+
+static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
+ ptrdiff_t stride, int bw,
+ int bh, const uint16_t *left,
+ int dy) {
+ assert(bw % 4 == 0);
+ assert(bh % 4 == 0);
+ assert(dy > 0);
+
+ // Factor out left + 1 to give the compiler a better chance of recognising
+ // that the offsets used for the loads from left and left + 1 are otherwise
+ // identical.
+ const uint16_t *left1 = left + 1;
+
+ const int max_base_y = (bw + bh - 1);
+ const int left_max = left[max_base_y];
+ const int frac_bits = 6;
+
+ const uint16x8_t iota1x8 = vreinterpretq_u16_s16(vld1q_s16(iota1_s16));
+ const uint16x4_t iota1x4 = vget_low_u16(iota1x8);
+
+ // The C implementation of the z3 predictor when not upsampling uses:
+ // ((y & 0x3f) >> 1)
+  // The right shift is unnecessary here since the final narrowing shift is 6
+  // instead of 5; adjust the mask to 0x3e so the bit that would have been
+  // shifted out is not included.
+ const uint16x4_t shift_mask = vdup_n_u16(0x3e);
+
+ if (bh == 4) {
+ int y = dy;
+ int c = 0;
+ do {
+ // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
+ // multiply instructions.
+ const uint16x4_t shifts1 =
+ vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
+ const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
+ const int base0 = (y + 0 * dy) >> frac_bits;
+ const int base1 = (y + 1 * dy) >> frac_bits;
+ const int base2 = (y + 2 * dy) >> frac_bits;
+ const int base3 = (y + 3 * dy) >> frac_bits;
+ uint16x4_t out[4];
+ if (base0 >= max_base_y) {
+ out[0] = vdup_n_u16(left_max);
+ } else {
+ const uint16x4_t l00 = vld1_u16(left + base0);
+ const uint16x4_t l01 = vld1_u16(left1 + base0);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota1x4, base0, l00, l01,
+ shifts0, shifts1, 0, 6);
+ }
+ if (base1 >= max_base_y) {
+ out[1] = vdup_n_u16(left_max);
+ } else {
+ const uint16x4_t l10 = vld1_u16(left + base1);
+ const uint16x4_t l11 = vld1_u16(left1 + base1);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota1x4, base1, l10, l11,
+ shifts0, shifts1, 1, 6);
+ }
+ if (base2 >= max_base_y) {
+ out[2] = vdup_n_u16(left_max);
+ } else {
+ const uint16x4_t l20 = vld1_u16(left + base2);
+ const uint16x4_t l21 = vld1_u16(left1 + base2);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota1x4, base2, l20, l21,
+ shifts0, shifts1, 2, 6);
+ }
+ if (base3 >= max_base_y) {
+ out[3] = vdup_n_u16(left_max);
+ } else {
+ const uint16x4_t l30 = vld1_u16(left + base3);
+ const uint16x4_t l31 = vld1_u16(left1 + base3);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota1x4, base3, l30, l31,
+ shifts0, shifts1, 3, 6);
+ }
+ transpose_array_inplace_u16_4x4(out);
+ for (int r2 = 0; r2 < 4; ++r2) {
+ vst1_u16(dst + r2 * stride + c, out[r2]);
+ }
+ y += 4 * dy;
+ c += 4;
+ } while (c < bw);
+ } else {
+ int y = dy;
+ int c = 0;
+ do {
+ int r = 0;
+ do {
+ // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
+ // multiply instructions.
+ const uint16x4_t shifts1 =
+ vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
+ const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
+ const int base0 = ((y + 0 * dy) >> frac_bits) + r;
+ const int base1 = ((y + 1 * dy) >> frac_bits) + r;
+ const int base2 = ((y + 2 * dy) >> frac_bits) + r;
+ const int base3 = ((y + 3 * dy) >> frac_bits) + r;
+ uint16x8_t out[4];
+ if (base0 >= max_base_y) {
+ out[0] = vdupq_n_u16(left_max);
+ } else {
+ const uint16x8_t l00 = vld1q_u16(left + base0);
+ const uint16x8_t l01 = vld1q_u16(left1 + base0);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l00, l01,
+ shifts0, shifts1, 0, 6);
+ }
+ if (base1 >= max_base_y) {
+ out[1] = vdupq_n_u16(left_max);
+ } else {
+ const uint16x8_t l10 = vld1q_u16(left + base1);
+ const uint16x8_t l11 = vld1q_u16(left1 + base1);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l10, l11,
+ shifts0, shifts1, 1, 6);
+ }
+ if (base2 >= max_base_y) {
+ out[2] = vdupq_n_u16(left_max);
+ } else {
+ const uint16x8_t l20 = vld1q_u16(left + base2);
+ const uint16x8_t l21 = vld1q_u16(left1 + base2);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l20, l21,
+ shifts0, shifts1, 2, 6);
+ }
+ if (base3 >= max_base_y) {
+ out[3] = vdupq_n_u16(left_max);
+ } else {
+ const uint16x8_t l30 = vld1q_u16(left + base3);
+ const uint16x8_t l31 = vld1q_u16(left1 + base3);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l30, l31,
+ shifts0, shifts1, 3, 6);
+ }
+ transpose_array_inplace_u16_4x8(out);
+ for (int r2 = 0; r2 < 4; ++r2) {
+ vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2]));
+ }
+ for (int r2 = 0; r2 < 4; ++r2) {
+ vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2]));
+ }
+ r += 8;
+ } while (r < bh);
+ y += 4 * dy;
+ c += 4;
+ } while (c < bw);
+ }
+}
+
+static void highbd_dr_prediction_z3_upsample1_neon(uint16_t *dst,
+ ptrdiff_t stride, int bw,
+ int bh, const uint16_t *left,
+ int dy) {
+ assert(bw % 4 == 0);
+ assert(bh % 4 == 0);
+ assert(dy > 0);
+
+ const int max_base_y = (bw + bh - 1) << 1;
+ const int left_max = left[max_base_y];
+ const int frac_bits = 5;
+
+ const uint16x4_t iota1x4 = vreinterpret_u16_s16(vld1_s16(iota1_s16));
+ const uint16x8_t iota2x8 = vreinterpretq_u16_s16(vld1q_s16(iota2_s16));
+ const uint16x4_t iota2x4 = vget_low_u16(iota2x8);
+
+ // The C implementation of the z3 predictor when upsampling uses:
+ // (((x << 1) & 0x3f) >> 1)
+ // The two shifts are unnecessary here since the lowest bit is guaranteed to
+ // be zero when the mask is applied, so adjust the mask to 0x1f to avoid
+ // needing the shifts at all.
+ const uint16x4_t shift_mask = vdup_n_u16(0x1F);
+
+ if (bh == 4) {
+ int y = dy;
+ int c = 0;
+ do {
+ // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
+ // multiply instructions.
+ const uint16x4_t shifts1 =
+ vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
+ const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1);
+ const int base0 = (y + 0 * dy) >> frac_bits;
+ const int base1 = (y + 1 * dy) >> frac_bits;
+ const int base2 = (y + 2 * dy) >> frac_bits;
+ const int base3 = (y + 3 * dy) >> frac_bits;
+ const uint16x4x2_t l0 = vld2_u16(left + base0);
+ const uint16x4x2_t l1 = vld2_u16(left + base1);
+ const uint16x4x2_t l2 = vld2_u16(left + base2);
+ const uint16x4x2_t l3 = vld2_u16(left + base3);
+ uint16x4_t out[4];
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota2x4, base0, l0.val[0],
+ l0.val[1], shifts0, shifts1, 0, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota2x4, base1, l1.val[0],
+ l1.val[1], shifts0, shifts1, 1, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota2x4, base2, l2.val[0],
+ l2.val[1], shifts0, shifts1, 2, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota2x4, base3, l3.val[0],
+ l3.val[1], shifts0, shifts1, 3, 5);
+ transpose_array_inplace_u16_4x4(out);
+ for (int r2 = 0; r2 < 4; ++r2) {
+ vst1_u16(dst + r2 * stride + c, out[r2]);
+ }
+ y += 4 * dy;
+ c += 4;
+ } while (c < bw);
+ } else {
+ assert(bh % 8 == 0);
+
+ int y = dy;
+ int c = 0;
+ do {
+ int r = 0;
+ do {
+ // Fully unroll the 4x8 block to allow us to use immediate lane-indexed
+ // multiply instructions.
+ const uint16x4_t shifts1 =
+ vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
+ const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1);
+ const int base0 = ((y + 0 * dy) >> frac_bits) + (r * 2);
+ const int base1 = ((y + 1 * dy) >> frac_bits) + (r * 2);
+ const int base2 = ((y + 2 * dy) >> frac_bits) + (r * 2);
+ const int base3 = ((y + 3 * dy) >> frac_bits) + (r * 2);
+ const uint16x8x2_t l0 = vld2q_u16(left + base0);
+ const uint16x8x2_t l1 = vld2q_u16(left + base1);
+ const uint16x8x2_t l2 = vld2q_u16(left + base2);
+ const uint16x8x2_t l3 = vld2q_u16(left + base3);
+ uint16x8_t out[4];
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota2x8, base0, l0.val[0],
+ l0.val[1], shifts0, shifts1, 0, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota2x8, base1, l1.val[0],
+ l1.val[1], shifts0, shifts1, 1, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota2x8, base2, l2.val[0],
+ l2.val[1], shifts0, shifts1, 2, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota2x8, base3, l3.val[0],
+ l3.val[1], shifts0, shifts1, 3, 5);
+ transpose_array_inplace_u16_4x8(out);
+ for (int r2 = 0; r2 < 4; ++r2) {
+ vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2]));
+ }
+ for (int r2 = 0; r2 < 4; ++r2) {
+ vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2]));
+ }
+ r += 8;
+ } while (r < bh);
+ y += 4 * dy;
+ c += 4;
+ } while (c < bw);
+ }
+}
+
+// Directional prediction, zone 3: 180 < angle < 270
+void av1_highbd_dr_prediction_z3_neon(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int upsample_left,
+ int dx, int dy, int bd) {
+ (void)above;
+ (void)dx;
+ (void)bd;
+ assert(bw % 4 == 0);
+ assert(bh % 4 == 0);
+ assert(dx == 1);
+ assert(dy > 0);
+
+ if (upsample_left) {
+ highbd_dr_prediction_z3_upsample1_neon(dst, stride, bw, bh, left, dy);
+ } else {
+ highbd_dr_prediction_z3_upsample0_neon(dst, stride, bw, bh, left, dy);
+ }
+}
+
+#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X4
+#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X8
diff --git a/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c b/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c
new file mode 100644
index 0000000000..77727b7665
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c
@@ -0,0 +1,1265 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
+static INLINE int16x4_t clip3_s16(const int16x4_t val, const int16x4_t low,
+ const int16x4_t high) {
+ return vmin_s16(vmax_s16(val, low), high);
+}
+
+static INLINE uint16x8_t convert_to_unsigned_pixel_u16(int16x8_t val,
+ int bitdepth) {
+ const int16x8_t low = vdupq_n_s16(0);
+ const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1);
+
+ return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(val, low)), high);
+}
+
+// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
+static INLINE uint16x4_t hev(const uint16x8_t abd_p0p1_q0q1,
+ const uint16_t thresh) {
+ const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh));
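+  // The p-side comparison sits in the low half and the q-side in the high
+  // half; OR them so that either side exceeding the threshold sets the mask.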
+ return vorr_u16(vget_low_u16(a), vget_high_u16(a));
+}
+
+// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+static INLINE uint16x4_t outer_threshold(const uint16x4_t p1,
+ const uint16x4_t p0,
+ const uint16x4_t q0,
+ const uint16x4_t q1,
+ const uint16_t outer_thresh) {
+ const uint16x4_t abd_p0q0 = vabd_u16(p0, q0);
+ const uint16x4_t abd_p1q1 = vabd_u16(p1, q1);
+ const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1);
+ const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1);
+ const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half);
+ return vcle_u16(sum, vdup_n_u16(outer_thresh));
+}
+
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// outer_threshold()
+static INLINE uint16x4_t needs_filter4(const uint16x8_t abd_p0p1_q0q1,
+ const uint16_t inner_thresh,
+ const uint16x4_t outer_mask) {
+ const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh));
+ const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a));
+ return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+// outer_threshold()
+static INLINE uint16x4_t needs_filter6(const uint16x8_t abd_p0p1_q0q1,
+ const uint16x8_t abd_p1p2_q1q2,
+ const uint16_t inner_thresh,
+ const uint16x4_t outer_mask) {
+ const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+ const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh));
+ const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b));
+ return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh &&
+// outer_threshold()
+static INLINE uint16x4_t needs_filter8(const uint16x8_t abd_p0p1_q0q1,
+ const uint16x8_t abd_p1p2_q1q2,
+ const uint16x8_t abd_p2p3_q2q3,
+ const uint16_t inner_thresh,
+ const uint16x4_t outer_mask) {
+ const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+ const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3);
+ const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh));
+ const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c));
+ return vand_u16(inner_mask, outer_mask);
+}
+
+// -----------------------------------------------------------------------------
+// filterN_masks functions.
+
+static INLINE void filter4_masks(const uint16x8_t p0q0, const uint16x8_t p1q1,
+ const uint16_t hev_thresh,
+ const uint16x4_t outer_mask,
+ const uint16_t inner_thresh,
+ uint16x4_t *const hev_mask,
+ uint16x4_t *const needs_filter4_mask) {
+ const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+ // This includes cases where needs_filter4() is not true and so filter2() will
+ // not be applied.
+ const uint16x4_t hev_tmp_mask = hev(p0p1_q0q1, hev_thresh);
+
+ *needs_filter4_mask = needs_filter4(p0p1_q0q1, inner_thresh, outer_mask);
+
+ // filter2() will only be applied if both needs_filter4() and hev() are true.
+ *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask);
+}
+
+// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
+// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+static INLINE uint16x4_t is_flat3(const uint16x8_t abd_p0p1_q0q1,
+ const uint16x8_t abd_p0p2_q0q2,
+ const int bitdepth) {
+ const int flat_thresh = 1 << (bitdepth - 8);
+ const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2);
+ const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh));
+ return vand_u16(vget_low_u16(b), vget_high_u16(b));
+}
+
+static INLINE void filter6_masks(
+ const uint16x8_t p2q2, const uint16x8_t p1q1, const uint16x8_t p0q0,
+ const uint16_t hev_thresh, const uint16x4_t outer_mask,
+ const uint16_t inner_thresh, const int bitdepth,
+ uint16x4_t *const needs_filter6_mask, uint16x4_t *const is_flat3_mask,
+ uint16x4_t *const hev_mask) {
+ const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+ *hev_mask = hev(abd_p0p1_q0q1, hev_thresh);
+ *is_flat3_mask = is_flat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), bitdepth);
+ *needs_filter6_mask = needs_filter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2),
+ inner_thresh, outer_mask);
+}
+
+// is_flat4 uses N=1, IsFlatOuter4 uses N=4.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+static INLINE uint16x4_t is_flat4(const uint16x8_t abd_pnp0_qnq0,
+ const uint16x8_t abd_pn1p0_qn1q0,
+ const uint16x8_t abd_pn2p0_qn2q0,
+ const int bitdepth) {
+ const int flat_thresh = 1 << (bitdepth - 8);
+ const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0);
+ const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0);
+ const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh));
+ return vand_u16(vget_low_u16(c), vget_high_u16(c));
+}
+
+static INLINE void filter8_masks(
+ const uint16x8_t p3q3, const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, const uint16_t hev_thresh,
+ const uint16x4_t outer_mask, const uint16_t inner_thresh,
+ const int bitdepth, uint16x4_t *const needs_filter8_mask,
+ uint16x4_t *const is_flat4_mask, uint16x4_t *const hev_mask) {
+ const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+ *hev_mask = hev(abd_p0p1_q0q1, hev_thresh);
+ const uint16x4_t v_is_flat4 = is_flat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2),
+ vabdq_u16(p0q0, p3q3), bitdepth);
+ *needs_filter8_mask =
+ needs_filter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3),
+ inner_thresh, outer_mask);
+ // |is_flat4_mask| is used to decide where to use the result of filter8.
+ // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false,
+ // overriding the question of whether to use filter8. Because filter4 doesn't
+ // apply to p2q2, |is_flat4_mask| chooses directly between filter8 and the
+ // source value. To be correct, the mask must account for this override.
+ *is_flat4_mask = vand_u16(v_is_flat4, *needs_filter8_mask);
+}
+
+// -----------------------------------------------------------------------------
+// filterN functions.
+
+// Calculate filter4() or filter2() based on |hev_mask|.
+static INLINE void filter4(const uint16x8_t p0q0, const uint16x8_t p0q1,
+ const uint16x8_t p1q1, const uint16x4_t hev_mask,
+ int bitdepth, uint16x8_t *const p1q1_result,
+ uint16x8_t *const p0q0_result) {
+ const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4);
+ // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+ // q0mp0 means "q0 minus p0".
+ const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1));
+ const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+ // If this is for filter2() then include |p1mq1|. Otherwise zero it.
+ const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (bitdepth - 1)));
+ const int16x4_t max_signed_pixel = vdup_n_s16((1 << (bitdepth - 1)) - 1);
+ const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+ const int16x4_t p1mq1_saturated =
+ clip3_s16(p1mq1, min_signed_pixel, max_signed_pixel);
+ const int16x4_t hev_option =
+ vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated);
+
+ const int16x4_t a = vadd_s16(q0mp0_3, hev_option);
+
+  // Need to figure out what is going on here: there appear to be some
+  // unnecessary tricks carried over from accommodating 8x8 as the smallest
+  // 8bpp vector.
+
+  // We can not shift with rounding because the clamp comes *before* the
+  // shifting:
+  //   a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  //   a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+ const int16x4_t plus_four =
+ clip3_s16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel);
+ const int16x4_t plus_three =
+ clip3_s16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel);
+ const int16x4_t a1 = vshr_n_s16(plus_four, 3);
+ const int16x4_t a2 = vshr_n_s16(plus_three, 3);
+
+ // a3 = (a1 + 1) >> 1;
+ const int16x4_t a3 = vrshr_n_s16(a1, 1);
+
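+  // Pack +a3 (applied to p1) alongside -a3 (applied to q1) so a single vaddq
+  // updates both halves of p1q1.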
+ const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3));
+ const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3);
+
+  // p0 is adjusted by +a2 while q0 is adjusted by -a1, so pair a2 with the
+  // negation of a1 rather than ending up with a2_ma2.
+ const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1));
+ const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1);
+ *p1q1_result = convert_to_unsigned_pixel_u16(p1q1_a3, bitdepth);
+ *p0q0_result = convert_to_unsigned_pixel_u16(p0q0_a, bitdepth);
+}
+
+void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16_t *const dst_p1 = (uint16_t *)(s - 2 * pitch);
+ uint16_t *const dst_p0 = (uint16_t *)(s - pitch);
+ uint16_t *const dst_q0 = (uint16_t *)(s);
+ uint16_t *const dst_q1 = (uint16_t *)(s + pitch);
+
+ const uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0),
+ vld1_u16(dst_q0), vld1_u16(dst_q1) };
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask =
+ outer_threshold(src[0], src[1], src[2], src[3], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+ const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+ filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+ &needs_filter4_mask);
+
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter4_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter4_mask_8 =
+ vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+ uint16x8_t f_p1q1;
+ uint16x8_t f_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f_p1q1, &f_p0q0);
+
+ // Already integrated the hev mask when calculating the filtered values.
+ const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+ // p1/q1 are unmodified if only hev() is true. This works because it was and'd
+ // with |needs_filter4_mask| previously.
+ const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+ const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void aom_highbd_lpf_horizontal_4_dual_neon(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_4_neon(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_horizontal_4_neon(s + 4, pitch, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ // Offset by 2 uint16_t values to load from first p1 position.
+ uint16_t *dst = s - 2;
+ uint16_t *dst_p1 = dst;
+ uint16_t *dst_p0 = dst + pitch;
+ uint16_t *dst_q0 = dst + pitch * 2;
+ uint16_t *dst_q1 = dst + pitch * 3;
+
+ uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+ vld1_u16(dst_q1) };
+ transpose_array_inplace_u16_4x4(src);
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask =
+ outer_threshold(src[0], src[1], src[2], src[3], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+ const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+ filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+ &needs_filter4_mask);
+
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter4_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter4_mask_8 =
+ vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+ uint16x8_t f_p1q1;
+ uint16x8_t f_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f_p1q1, &f_p0q0);
+
+ // Already integrated the hev mask when calculating the filtered values.
+ const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+ // p1/q1 are unmodified if only hev() is true. This works because it was and'd
+ // with |needs_filter4_mask| previously.
+ const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+ const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+ uint16x4_t output[4] = {
+ vget_low_u16(p1q1_output),
+ vget_low_u16(p0q0_output),
+ vget_high_u16(p0q0_output),
+ vget_high_u16(p1q1_output),
+ };
+ transpose_array_inplace_u16_4x4(output);
+
+ vst1_u16(dst_p1, output[0]);
+ vst1_u16(dst_p0, output[1]);
+ vst1_u16(dst_q0, output[2]);
+ vst1_u16(dst_q1, output[3]);
+}
+
+void aom_highbd_lpf_vertical_4_dual_neon(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_4_neon(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_vertical_4_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+static INLINE void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, uint16x8_t *const p1q1_output,
+ uint16x8_t *const p0q0_output) {
+ // Sum p1 and q1 output from opposite directions.
+ // The formula is regrouped to allow 3 doubling operations to be combined.
+ //
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^^^^^^^
+ // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^^^^^^^
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^^^^^^
+ uint16x8_t sum = vaddq_u16(p2q2, p1q1);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^
+ sum = vaddq_u16(sum, p0q0);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^
+ sum = vshlq_n_u16(sum, 1);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^ ^^^^^^
+ // Should dual issue with the left shift.
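+  // vextq_u16 with an offset of 4 rotates the vector by half its lanes,
+  // swapping the p and q halves so q0p0 lines up lane-for-lane with p2q2.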
+ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
+ const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
+ sum = vaddq_u16(sum, outer_sum);
+
+ *p1q1_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - (2 * p2) + q0 + q1
+ // q0 = q1 - (2 * q2) + p0 + p1
+ // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+ // ^^^^^^^^
+ const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1);
+ // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+ // ^^^^^^^^
+ sum = vsubq_u16(sum, p2q2_double);
+ const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
+ sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
+
+ *p0q0_output = vrshrq_n_u16(sum, 3);
+}
+
+void aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16_t *const dst_p2 = s - 3 * pitch;
+ uint16_t *const dst_p1 = s - 2 * pitch;
+ uint16_t *const dst_p0 = s - pitch;
+ uint16_t *const dst_q0 = s;
+ uint16_t *const dst_q1 = s + pitch;
+ uint16_t *const dst_q2 = s + 2 * pitch;
+
+ const uint16x4_t src[6] = { vld1_u16(dst_p2), vld1_u16(dst_p1),
+ vld1_u16(dst_p0), vld1_u16(dst_q0),
+ vld1_u16(dst_q1), vld1_u16(dst_q2) };
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask =
+ outer_threshold(src[1], src[2], src[3], src[4], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat3_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+ const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+ const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+ filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd,
+ &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or
+ // filter6. Therefore if it is false when |needs_filter_mask| is true, filter6
+ // output is not used.
+ uint16x8_t f6_p1q1, f6_p0q0;
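+  // Reinterpret the 4x16-bit mask as a single 64-bit lane so one scalar
+  // compare can test whether any lane is set.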
+ const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+ if (vget_lane_u64(need_filter6, 0) == 0) {
+ // filter6() does not apply, but filter4() applies to one or more values.
+ p0q0_output = p0q0;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+ p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void aom_highbd_lpf_horizontal_6_dual_neon(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_6_neon(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_horizontal_6_neon(s + 4, pitch, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ // Left side of the filter window.
+ uint16_t *const dst = s - 3;
+ uint16_t *const dst_0 = dst;
+ uint16_t *const dst_1 = dst + pitch;
+ uint16_t *const dst_2 = dst + 2 * pitch;
+ uint16_t *const dst_3 = dst + 3 * pitch;
+
+ // Overread by 2 values. These overreads become the high halves of src_raw[2]
+ // and src_raw[3] after transpose.
+ uint16x8_t src_raw[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1),
+ vld1q_u16(dst_2), vld1q_u16(dst_3) };
+ transpose_array_inplace_u16_4x8(src_raw);
+ // p2, p1, p0, q0, q1, q2
+ const uint16x4_t src[6] = {
+ vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]),
+ vget_low_u16(src_raw[2]), vget_low_u16(src_raw[3]),
+ vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]),
+ };
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask =
+ outer_threshold(src[1], src[2], src[3], src[4], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat3_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+ const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+ const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+ filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd,
+ &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or
+ // filter6. Therefore if it is false when |needs_filter_mask| is true, filter6
+ // output is not used.
+ uint16x8_t f6_p1q1, f6_p0q0;
+ const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+ if (vget_lane_u64(need_filter6, 0) == 0) {
+ // filter6() does not apply, but filter4() applies to one or more values.
+ p0q0_output = p0q0;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+ p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ uint16x4_t output[4] = {
+ vget_low_u16(p1q1_output),
+ vget_low_u16(p0q0_output),
+ vget_high_u16(p0q0_output),
+ vget_high_u16(p1q1_output),
+ };
+ transpose_array_inplace_u16_4x4(output);
+
+ // dst_n starts at p2, so adjust to p1.
+ vst1_u16(dst_0 + 1, output[0]);
+ vst1_u16(dst_1 + 1, output[1]);
+ vst1_u16(dst_2 + 1, output[2]);
+ vst1_u16(dst_3 + 1, output[3]);
+}
+
+void aom_highbd_lpf_vertical_6_dual_neon(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_6_neon(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_vertical_6_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+static INLINE void filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
+ const uint16x8_t p1q1, const uint16x8_t p0q0,
+ uint16x8_t *const p2q2_output,
+ uint16x8_t *const p1q1_output,
+ uint16x8_t *const p0q0_output) {
+ // Sum p2 and q2 output from opposite directions.
+ // The formula is regrouped to allow 2 doubling operations to be combined.
+ // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+ // ^^^^^^^^
+ // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+ // ^^^^^^^^
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^
+ const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^
+ uint16x8_t sum = vshlq_n_u16(p23q23, 1);
+
+ // Add two other terms to make dual issue with shift more likely.
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^
+ const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^^^
+ sum = vaddq_u16(sum, p01q01);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^
+ sum = vaddq_u16(sum, p3q3);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^
+ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
+ sum = vaddq_u16(sum, q0p0);
+
+ *p2q2_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p3 - p2 + p1 + q1
+  // q1 = q2 - q3 - q2 + q1 + p1
+ sum = vsubq_u16(sum, p23q23);
+ const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
+ sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
+
+ *p1q1_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p3 - p1 + p0 + q2
+ // q0 = q1 - q3 - q1 + q0 + p2
+ sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
+ const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4);
+ sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
+
+ *p0q0_output = vrshrq_n_u16(sum, 3);
+}
+
+void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16_t *const dst_p3 = s - 4 * pitch;
+ uint16_t *const dst_p2 = s - 3 * pitch;
+ uint16_t *const dst_p1 = s - 2 * pitch;
+ uint16_t *const dst_p0 = s - pitch;
+ uint16_t *const dst_q0 = s;
+ uint16_t *const dst_q1 = s + pitch;
+ uint16_t *const dst_q2 = s + 2 * pitch;
+ uint16_t *const dst_q3 = s + 3 * pitch;
+
+ const uint16x4_t src[8] = { vld1_u16(dst_p3), vld1_u16(dst_p2),
+ vld1_u16(dst_p1), vld1_u16(dst_p0),
+ vld1_u16(dst_q0), vld1_u16(dst_q1),
+ vld1_u16(dst_q2), vld1_u16(dst_q3) };
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask =
+ outer_threshold(src[2], src[3], src[4], src[5], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]);
+ const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]);
+ const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]);
+ const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]);
+ filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]);
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
+ // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
+ // output is not used.
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // filter8() does not apply, but filter4() applies to one or more values.
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t is_flat4_mask_8 =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+ vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+}
+
+void aom_highbd_lpf_horizontal_8_dual_neon(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_8_neon(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_horizontal_8_neon(s + 4, pitch, blimit1, limit1, thresh1, bd);
+}
+
+static INLINE uint16x8_t reverse_low_half(const uint16x8_t a) {
+ return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a));
+}
+
+void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16_t *const dst = s - 4;
+ uint16_t *const dst_0 = dst;
+ uint16_t *const dst_1 = dst + pitch;
+ uint16_t *const dst_2 = dst + 2 * pitch;
+ uint16_t *const dst_3 = dst + 3 * pitch;
+
+  // src[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n.
+ // To get desired pairs after transpose, one half should be reversed.
+ uint16x8_t src[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+ vld1q_u16(dst_3) };
+
+ // src[0] = p0q0
+ // src[1] = p1q1
+ // src[2] = p2q2
+ // src[3] = p3q3
+ loop_filter_transpose_u16_4x8q(src);
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask = outer_threshold(
+ vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]),
+ vget_high_u16(src[1]), outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ const uint16x8_t p0q0 = src[0];
+ const uint16x8_t p1q1 = src[1];
+ const uint16x8_t p2q2 = src[2];
+ const uint16x8_t p3q3 = src[3];
+ filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
+ // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
+ // output is not used.
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // filter8() does not apply, but filter4() applies to one or more values.
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t is_flat4_mask_8 =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ uint16x8_t output[4] = { p0q0_output, p1q1_output, p2q2_output, p3q3 };
+ // After transpose, |output| will contain rows of the form:
+ // p0 p1 p2 p3 q0 q1 q2 q3
+ transpose_array_inplace_u16_4x8(output);
+
+ // Reverse p values to produce original order:
+ // p3 p2 p1 p0 q0 q1 q2 q3
+ vst1q_u16(dst_0, reverse_low_half(output[0]));
+ vst1q_u16(dst_1, reverse_low_half(output[1]));
+ vst1q_u16(dst_2, reverse_low_half(output[2]));
+ vst1q_u16(dst_3, reverse_low_half(output[3]));
+}
+
+void aom_highbd_lpf_vertical_8_dual_neon(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_8_neon(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_vertical_8_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+static INLINE void filter14(
+ const uint16x8_t p6q6, const uint16x8_t p5q5, const uint16x8_t p4q4,
+ const uint16x8_t p3q3, const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, uint16x8_t *const p5q5_output,
+ uint16x8_t *const p4q4_output, uint16x8_t *const p3q3_output,
+ uint16x8_t *const p2q2_output, uint16x8_t *const p1q1_output,
+ uint16x8_t *const p0q0_output) {
+ // Sum p5 and q5 output from opposite directions.
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^
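+  // Compute 7 * p6q6 as (8 * p6q6) - p6q6: a left shift by 3 and a subtract.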
+ const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^^^^^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^^^^^^^^^^^^
+ uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1);
+ sum = vaddq_u16(sum, p6q6_x7);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^
+ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
+ sum = vaddq_u16(sum, q0p0);
+
+ *p5q5_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p4 and q4 output:
+ // p4 = p5 - (2 * p6) + p3 + q1
+ // q4 = q5 - (2 * q6) + q3 + p1
+ sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
+ const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
+ sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
+
+ *p4q4_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p3 and q3 output:
+ // p3 = p4 - p6 - p5 + p2 + q2
+ // q3 = q4 - q6 - q5 + q2 + p2
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
+ const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4);
+ sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
+
+ *p3q3_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p2 and q2 output:
+ // p2 = p3 - p6 - p4 + p1 + q3
+ // q2 = q3 - q6 - q4 + q1 + p3
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
+ const uint16x8_t q3p3 = vextq_u16(p3q3, p3q3, 4);
+ sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
+
+ *p2q2_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p6 - p3 + p0 + q4
+ // q1 = q2 - q6 - q3 + q0 + p4
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
+ const uint16x8_t q4p4 = vextq_u16(p4q4, p4q4, 4);
+ sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
+
+ *p1q1_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p6 - p2 + q0 + q5
+ // q0 = q1 - q6 - q2 + p0 + p5
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
+ const uint16x8_t q5p5 = vextq_u16(p5q5, p5q5, 4);
+ sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
+
+ *p0q0_output = vrshrq_n_u16(sum, 4);
+}
+
+void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16_t *const dst_p6 = s - 7 * pitch;
+ uint16_t *const dst_p5 = s - 6 * pitch;
+ uint16_t *const dst_p4 = s - 5 * pitch;
+ uint16_t *const dst_p3 = s - 4 * pitch;
+ uint16_t *const dst_p2 = s - 3 * pitch;
+ uint16_t *const dst_p1 = s - 2 * pitch;
+ uint16_t *const dst_p0 = s - pitch;
+ uint16_t *const dst_q0 = s;
+ uint16_t *const dst_q1 = s + pitch;
+ uint16_t *const dst_q2 = s + 2 * pitch;
+ uint16_t *const dst_q3 = s + 3 * pitch;
+ uint16_t *const dst_q4 = s + 4 * pitch;
+ uint16_t *const dst_q5 = s + 5 * pitch;
+ uint16_t *const dst_q6 = s + 6 * pitch;
+
+ const uint16x4_t src[14] = {
+ vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3),
+ vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+ vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4),
+ vld1_u16(dst_q5), vld1_u16(dst_q6)
+ };
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask =
+ outer_threshold(src[5], src[6], src[7], src[8], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]);
+ const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]);
+ const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]);
+ const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]);
+ filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+ const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
+ const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
+ const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]);
+ // Mask to choose between the outputs of filter8 and filter14.
+ // As with the derivation of |is_flat4_mask|, the question of whether to use
+ // filter14 is only raised where |is_flat4_mask| is true.
+ const uint16x4_t is_flat4_outer_mask = vand_u16(
+ is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+ vabdq_u16(p0q0, p6q6), bd));
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]);
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+ p5q5_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
+ // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
+ // output is not used.
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // filter8() and filter14() do not apply, but filter4() applies to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t use_filter8_mask =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+ if (vget_lane_u64(need_filter14, 0) == 0) {
+ // filter14() does not apply, but filter8() and filter4() apply to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ } else {
+ // All filters may contribute values to final outputs.
+ const uint16x8_t use_filter14_mask =
+ vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+ uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+ filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+ p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+ p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+ p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+ p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+ p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+ p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+ p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+ p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+ p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+ }
+
+ vst1_u16(dst_p5, vget_low_u16(p5q5_output));
+ vst1_u16(dst_p4, vget_low_u16(p4q4_output));
+ vst1_u16(dst_p3, vget_low_u16(p3q3_output));
+ vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+ vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+ vst1_u16(dst_q3, vget_high_u16(p3q3_output));
+ vst1_u16(dst_q4, vget_high_u16(p4q4_output));
+ vst1_u16(dst_q5, vget_high_u16(p5q5_output));
+}
+
+void aom_highbd_lpf_horizontal_14_dual_neon(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_14_neon(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_horizontal_14_neon(s + 4, pitch, blimit1, limit1, thresh1, bd);
+}
+
+static INLINE uint16x8x2_t permute_acdb64(const uint16x8_t ab,
+ const uint16x8_t cd) {
+ uint16x8x2_t acdb;
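+  // Treating |ab| and |cd| as pairs of 64-bit halves (a, b) and (c, d),
+  // return val[0] = (a, c) and val[1] = (d, b).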
+#if AOM_ARCH_AARCH64
+ // a[b] <- [c]d
+ acdb.val[0] = vreinterpretq_u16_u64(
+ vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd)));
+ // [a]b <- c[d]
+ acdb.val[1] = vreinterpretq_u16_u64(
+ vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab)));
+#else
+ // a[b] <- [c]d
+ acdb.val[0] = vreinterpretq_u16_u64(
+ vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0),
+ vreinterpretq_u64_u16(ab), 1));
+ // [a]b <- c[d]
+ acdb.val[1] = vreinterpretq_u16_u64(
+ vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1),
+ vreinterpretq_u64_u16(ab), 0));
+#endif // AOM_ARCH_AARCH64
+ return acdb;
+}
+
+void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16_t *const dst = s - 8;
+ uint16_t *const dst_0 = dst;
+ uint16_t *const dst_1 = dst + pitch;
+ uint16_t *const dst_2 = dst + 2 * pitch;
+ uint16_t *const dst_3 = dst + 3 * pitch;
+
+ // Low halves: p7 p6 p5 p4
+ // High halves: p3 p2 p1 p0
+ uint16x8_t src_p[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+ vld1q_u16(dst_3) };
+ // p7 will be the low half of src_p[0]. Not used until the end.
+ transpose_array_inplace_u16_4x8(src_p);
+
+ // Low halves: q0 q1 q2 q3
+ // High halves: q4 q5 q6 q7
+ uint16x8_t src_q[4] = { vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
+ vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8) };
+ // q7 will be the high half of src_q[3]. Not used until the end.
+ transpose_array_inplace_u16_4x8(src_q);
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask = outer_threshold(
+ vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]),
+ vget_low_u16(src_q[1]), outer_thresh);
+ const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4);
+ const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4);
+ const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4);
+ const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+ const uint16x8_t p4q4 =
+ vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
+ const uint16x8_t p5q5 =
+ vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1]));
+ const uint16x8_t p6q6 =
+ vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2]));
+ const uint16x8_t p7q7 =
+ vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3]));
+ // Mask to choose between the outputs of filter8 and filter14.
+ // As with the derivation of |is_flat4_mask|, the question of whether to use
+ // filter14 is only raised where |is_flat4_mask| is true.
+ const uint16x4_t is_flat4_outer_mask = vand_u16(
+ is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+ vabdq_u16(p0q0, p6q6), bd));
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+ p5q5_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
+ // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
+ // output is not used.
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // filter8() and filter14() do not apply, but filter4() applies to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t use_filter8_mask =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+ if (vget_lane_u64(need_filter14, 0) == 0) {
+ // filter14() does not apply, but filter8() and filter4() apply to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ } else {
+ // All filters may contribute values to final outputs.
+ const uint16x8_t use_filter14_mask =
+ vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+ uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+ filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+ p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+ p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+ p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+ p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+ p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+ p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+ p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+ p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+ p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+ }
+ // To get the correctly ordered rows from the transpose, we need:
+ // p7p3 p6p2 p5p1 p4p0
+ // q0q4 q1q5 q2q6 q3q7
+ const uint16x8x2_t p7p3_q3q7 = permute_acdb64(p7q7, p3q3_output);
+ const uint16x8x2_t p6p2_q2q6 = permute_acdb64(p6q6, p2q2_output);
+ const uint16x8x2_t p5p1_q1q5 = permute_acdb64(p5q5_output, p1q1_output);
+ const uint16x8x2_t p4p0_q0q4 = permute_acdb64(p4q4_output, p0q0_output);
+ uint16x8_t output_p[4] = { p7p3_q3q7.val[0], p6p2_q2q6.val[0],
+ p5p1_q1q5.val[0], p4p0_q0q4.val[0] };
+ transpose_array_inplace_u16_4x8(output_p);
+ uint16x8_t output_q[4] = { p4p0_q0q4.val[1], p5p1_q1q5.val[1],
+ p6p2_q2q6.val[1], p7p3_q3q7.val[1] };
+ transpose_array_inplace_u16_4x8(output_q);
+
+  // The transposed rows are already in the order they are stored:
+  // p7 p6 p5 p4 p3 p2 p1 p0 for |output_p| and q0 q1 q2 q3 q4 q5 q6 q7 for
+  // |output_q|.
+ vst1q_u16(dst_0, output_p[0]);
+ vst1q_u16(dst_0 + 8, output_q[0]);
+ vst1q_u16(dst_1, output_p[1]);
+ vst1q_u16(dst_1 + 8, output_q[1]);
+ vst1q_u16(dst_2, output_p[2]);
+ vst1q_u16(dst_2 + 8, output_q[2]);
+ vst1q_u16(dst_3, output_p[3]);
+ vst1q_u16(dst_3 + 8, output_q[3]);
+}
+
+void aom_highbd_lpf_vertical_14_dual_neon(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_14_neon(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_vertical_14_neon(s + 4 * pitch, pitch, blimit1, limit1,
+ thresh1, bd);
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_masked_sad_neon.c b/third_party/aom/aom_dsp/arm/highbd_masked_sad_neon.c
new file mode 100644
index 0000000000..9262d818e9
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_masked_sad_neon.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/blend.h"
+
+static INLINE uint16x8_t masked_sad_8x1_neon(uint16x8_t sad,
+ const uint16_t *src,
+ const uint16_t *a,
+ const uint16_t *b,
+ const uint8_t *m) {
+ const uint16x8_t s0 = vld1q_u16(src);
+ const uint16x8_t a0 = vld1q_u16(a);
+ const uint16x8_t b0 = vld1q_u16(b);
+ const uint16x8_t m0 = vmovl_u8(vld1_u8(m));
+
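+  // Blend the two predictors with the alpha-64 mask and accumulate the
+  // absolute difference against the source row.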
+ uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, a0, b0);
+
+ return vaddq_u16(sad, vabdq_u16(blend_u16, s0));
+}
+
+static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
+ const uint16_t *src,
+ const uint16_t *a,
+ const uint16_t *b,
+ const uint8_t *m) {
+ sad = masked_sad_8x1_neon(sad, src, a, b, m);
+ return masked_sad_8x1_neon(sad, &src[8], &a[8], &b[8], &m[8]);
+}
+
+static INLINE uint16x8_t masked_sad_32x1_neon(uint16x8_t sad,
+ const uint16_t *src,
+ const uint16_t *a,
+ const uint16_t *b,
+ const uint8_t *m) {
+ sad = masked_sad_16x1_neon(sad, src, a, b, m);
+ return masked_sad_16x1_neon(sad, &src[16], &a[16], &b[16], &m[16]);
+}
+
+static INLINE unsigned int masked_sad_128xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
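+  // For 12-bit input each 16-bit accumulator lane holds at most 16 absolute
+  // differences (16 * 4095 < 65535), i.e. 4 rows of a 32-wide slice, so widen
+  // into the 32-bit accumulators every 4 rows.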
+ do {
+ uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ for (int h = 0; h < 4; ++h) {
+ sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m);
+ sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]);
+ sad[2] = masked_sad_32x1_neon(sad[2], &src[64], &a[64], &b[64], &m[64]);
+ sad[3] = masked_sad_32x1_neon(sad[3], &src[96], &a[96], &b[96], &m[96]);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]);
+ sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]);
+ sad_u32[2] = vpadalq_u16(sad_u32[2], sad[2]);
+ sad_u32[3] = vpadalq_u16(sad_u32[3], sad[3]);
+ height -= 4;
+ } while (height != 0);
+
+ sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[1]);
+ sad_u32[2] = vaddq_u32(sad_u32[2], sad_u32[3]);
+ sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[2]);
+
+ return horizontal_add_u32x4(sad_u32[0]);
+}
+
+static INLINE unsigned int masked_sad_64xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ do {
+ uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+ for (int h = 0; h < 4; ++h) {
+ sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m);
+ sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]);
+ sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]);
+ height -= 4;
+ } while (height != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sad_u32[0], sad_u32[1]));
+}
+
+static INLINE unsigned int masked_sad_32xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t sad = vdupq_n_u16(0);
+ for (int h = 0; h < 4; ++h) {
+ sad = masked_sad_32x1_neon(sad, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32 = vpadalq_u16(sad_u32, sad);
+ height -= 4;
+ } while (height != 0);
+
+ return horizontal_add_u32x4(sad_u32);
+}
+
+static INLINE unsigned int masked_sad_16xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t sad_u16 = vdupq_n_u16(0);
+
+ for (int h = 0; h < 8; ++h) {
+ sad_u16 = masked_sad_16x1_neon(sad_u16, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32 = vpadalq_u16(sad_u32, sad_u16);
+ height -= 8;
+ } while (height != 0);
+
+ return horizontal_add_u32x4(sad_u32);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE unsigned int masked_sad_8xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t sad_u16 = vdupq_n_u16(0);
+
+ for (int h = 0; h < 16; ++h) {
+ sad_u16 = masked_sad_8x1_neon(sad_u16, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32 = vpadalq_u16(sad_u32, sad_u16);
+ height -= 16;
+ } while (height != 0);
+
+ return horizontal_add_u32x4(sad_u32);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE unsigned int masked_sad_16xh_small_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ // For 12-bit data, we can only accumulate up to 128 elements in the
+ // uint16x8_t type sad accumulator, so we can only process up to 8 rows
+ // before we have to accumulate into 32-bit elements.
+ assert(height <= 8);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint16x8_t sad = vdupq_n_u16(0);
+
+ do {
+ sad = masked_sad_16x1_neon(sad, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ } while (--height != 0);
+
+ return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned int masked_sad_8xh_small_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ // For 12-bit data, we can only accumulate up to 128 elements in the
+ // uint16x8_t type sad accumulator, so we can only process up to 16 rows
+ // before we have to accumulate into 32-bit elements.
+ assert(height <= 16);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint16x8_t sad = vdupq_n_u16(0);
+
+ do {
+ sad = masked_sad_8x1_neon(sad, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ } while (--height != 0);
+
+ return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned int masked_sad_4xh_small_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ // For 12-bit data, we can only accumulate up to 64 elements in the
+ // uint16x4_t type sad accumulator, so we can only process up to 16 rows
+ // before we have to accumulate into 32-bit elements.
+ assert(height <= 16);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ uint16x4_t sad = vdup_n_u16(0);
+ do {
+ uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(m)));
+ uint16x4_t a0 = load_unaligned_u16_4x1(a);
+ uint16x4_t b0 = load_unaligned_u16_4x1(b);
+ uint16x4_t s0 = load_unaligned_u16_4x1(src);
+
+ uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, a0, b0);
+
+ sad = vadd_u16(sad, vabd_u16(blend_u16, s0));
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ } while (--height != 0);
+
+ return horizontal_add_u16x4(sad);
+}
+
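+// When |invert_mask| is set the mask weights apply to |second_pred| rather
+// than |ref|, so the two prediction operands are simply swapped.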
+#define HIGHBD_MASKED_SAD_WXH_SMALL_NEON(w, h) \
+ unsigned int aom_highbd_masked_sad##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return masked_sad_##w##xh_small_neon(src, src_stride, ref, ref_stride, \
+ second_pred, w, msk, msk_stride, \
+ h); \
+ else \
+ return masked_sad_##w##xh_small_neon(src, src_stride, second_pred, w, \
+ ref, ref_stride, msk, msk_stride, \
+ h); \
+ }
+
+#define HIGHBD_MASKED_SAD_WXH_LARGE_NEON(w, h) \
+ unsigned int aom_highbd_masked_sad##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return masked_sad_##w##xh_large_neon(src, src_stride, ref, ref_stride, \
+ second_pred, w, msk, msk_stride, \
+ h); \
+ else \
+ return masked_sad_##w##xh_large_neon(src, src_stride, second_pred, w, \
+ ref, ref_stride, msk, msk_stride, \
+ h); \
+ }
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 4)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 8)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 4)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 8)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 16)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 8)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 16)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 32)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 16)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 32)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 64)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 32)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 64)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 128)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 64)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 16)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(8, 32)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 4)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 64)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 8)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c b/third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c
new file mode 100644
index 0000000000..28699e6f41
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
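+// Accumulate abs(wsrc - mask * ref), rounding-shifted right by 12, for eight
+// reference samples; vrsraq_n_u32 folds the shift into the accumulation.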
+static INLINE void highbd_obmc_sad_8x1_s16_neon(uint16x8_t ref,
+ const int32_t *mask,
+ const int32_t *wsrc,
+ uint32x4_t *sum) {
+ int16x8_t ref_s16 = vreinterpretq_s16_u16(ref);
+
+ int32x4_t wsrc_lo = vld1q_s32(wsrc);
+ int32x4_t wsrc_hi = vld1q_s32(wsrc + 4);
+
+ int32x4_t mask_lo = vld1q_s32(mask);
+ int32x4_t mask_hi = vld1q_s32(mask + 4);
+
+ int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi));
+
+ int32x4_t pre_lo = vmull_s16(vget_low_s16(ref_s16), vget_low_s16(mask_s16));
+ int32x4_t pre_hi = vmull_s16(vget_high_s16(ref_s16), vget_high_s16(mask_s16));
+
+ uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo));
+ uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi));
+
+ *sum = vrsraq_n_u32(*sum, abs_lo, 12);
+ *sum = vrsraq_n_u32(*sum, abs_hi, 12);
+}
+
+static INLINE unsigned int highbd_obmc_sad_4xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int height) {
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+ uint32x4_t sum = vdupq_n_u32(0);
+
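+  // Process two 4-wide rows per iteration so each iteration fills a full
+  // uint16x8_t of reference samples.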
+ int h = height / 2;
+ do {
+ uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride);
+
+ highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum);
+
+ ref_ptr += 2 * ref_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--h != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE unsigned int highbd_obmc_sad_8xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int height) {
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t r = vld1q_u16(ref_ptr);
+
+ highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum);
+
+ ref_ptr += ref_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--height != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE unsigned int highbd_obmc_sad_large_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int width, int height) {
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ do {
+ int i = 0;
+ do {
+ uint16x8_t r0 = vld1q_u16(ref_ptr + i);
+ highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]);
+
+ uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8);
+ highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]);
+
+ wsrc += 16;
+ mask += 16;
+ i += 16;
+ } while (i < width);
+
+ ref_ptr += ref_stride;
+ } while (--height != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int highbd_obmc_sad_16xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int h) {
+ return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h);
+}
+
+static INLINE unsigned int highbd_obmc_sad_32xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int height) {
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+
+ do {
+ uint16x8_t r0 = vld1q_u16(ref_ptr);
+ uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+ uint16x8_t r2 = vld1q_u16(ref_ptr + 16);
+ uint16x8_t r3 = vld1q_u16(ref_ptr + 24);
+
+ highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]);
+ highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]);
+ highbd_obmc_sad_8x1_s16_neon(r2, mask + 16, wsrc + 16, &sum[2]);
+ highbd_obmc_sad_8x1_s16_neon(r3, mask + 24, wsrc + 24, &sum[3]);
+
+ wsrc += 32;
+ mask += 32;
+ ref_ptr += ref_stride;
+ } while (--height != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[2]));
+}
+
+static INLINE unsigned int highbd_obmc_sad_64xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int h) {
+ return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h);
+}
+
+static INLINE unsigned int highbd_obmc_sad_128xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int h) {
+ return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 128, h);
+}
+
+#define HIGHBD_OBMC_SAD_WXH_NEON(w, h) \
+ unsigned int aom_highbd_obmc_sad##w##x##h##_neon( \
+ const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ return highbd_obmc_sad_##w##xh_neon(ref, ref_stride, wsrc, mask, h); \
+ }
+
+HIGHBD_OBMC_SAD_WXH_NEON(4, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(4, 8)
+
+HIGHBD_OBMC_SAD_WXH_NEON(8, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(8, 8)
+HIGHBD_OBMC_SAD_WXH_NEON(8, 16)
+
+HIGHBD_OBMC_SAD_WXH_NEON(16, 8)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 16)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 32)
+
+HIGHBD_OBMC_SAD_WXH_NEON(32, 16)
+HIGHBD_OBMC_SAD_WXH_NEON(32, 32)
+HIGHBD_OBMC_SAD_WXH_NEON(32, 64)
+
+HIGHBD_OBMC_SAD_WXH_NEON(64, 32)
+HIGHBD_OBMC_SAD_WXH_NEON(64, 64)
+HIGHBD_OBMC_SAD_WXH_NEON(64, 128)
+
+HIGHBD_OBMC_SAD_WXH_NEON(128, 64)
+HIGHBD_OBMC_SAD_WXH_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_OBMC_SAD_WXH_NEON(4, 16)
+
+HIGHBD_OBMC_SAD_WXH_NEON(8, 32)
+
+HIGHBD_OBMC_SAD_WXH_NEON(16, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 64)
+
+HIGHBD_OBMC_SAD_WXH_NEON(32, 8)
+
+HIGHBD_OBMC_SAD_WXH_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_obmc_variance_neon.c b/third_party/aom/aom_dsp/arm/highbd_obmc_variance_neon.c
new file mode 100644
index 0000000000..d59224619b
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_obmc_variance_neon.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
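+// Compute diff = ROUND_POWER_OF_TWO_SIGNED(wsrc - mask * pre, 12) for eight
+// values, accumulating diff into *sum and diff * diff into *sse.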
+static INLINE void highbd_obmc_variance_8x1_s16_neon(uint16x8_t pre,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ uint32x4_t *sse,
+ int32x4_t *sum) {
+ int16x8_t pre_s16 = vreinterpretq_s16_u16(pre);
+ int32x4_t wsrc_lo = vld1q_s32(&wsrc[0]);
+ int32x4_t wsrc_hi = vld1q_s32(&wsrc[4]);
+
+ int32x4_t mask_lo = vld1q_s32(&mask[0]);
+ int32x4_t mask_hi = vld1q_s32(&mask[4]);
+
+ int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi));
+
+ int32x4_t diff_lo = vmull_s16(vget_low_s16(pre_s16), vget_low_s16(mask_s16));
+ int32x4_t diff_hi =
+ vmull_s16(vget_high_s16(pre_s16), vget_high_s16(mask_s16));
+
+ diff_lo = vsubq_s32(wsrc_lo, diff_lo);
+ diff_hi = vsubq_s32(wsrc_hi, diff_hi);
+
+  // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away
+  // from zero, whereas vrshrq_n_s32 rounds to nearest with ties rounded up.
+  // The difference only matters for values lying exactly on a rounding
+  // breakpoint, so we can add -1 to all negative numbers to move the
+  // breakpoint one value across and into the correct rounding region.
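+  // For example, with the illustrative value diff = -2048:
+  // ROUND_POWER_OF_TWO_SIGNED(-2048, 12) = -1 (ties away from zero), but
+  // vrshrq_n_s32 would give (-2048 + 2048) >> 12 = 0. Adding diff >> 31 (-1)
+  // first yields (-2049 + 2048) >> 12 = -1, matching the scalar reference.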
+ diff_lo = vsraq_n_s32(diff_lo, diff_lo, 31);
+ diff_hi = vsraq_n_s32(diff_hi, diff_hi, 31);
+ int32x4_t round_lo = vrshrq_n_s32(diff_lo, 12);
+ int32x4_t round_hi = vrshrq_n_s32(diff_hi, 12);
+
+ *sum = vaddq_s32(*sum, round_lo);
+ *sum = vaddq_s32(*sum, round_hi);
+ *sse = vmlaq_u32(*sse, vreinterpretq_u32_s32(round_lo),
+ vreinterpretq_u32_s32(round_lo));
+ *sse = vmlaq_u32(*sse, vreinterpretq_u32_s32(round_hi),
+ vreinterpretq_u32_s32(round_hi));
+}
+
+// For 12-bit data we can only accumulate up to 256 squared values in each
+// unsigned 32-bit accumulator lane (4095*4095*256 = 4292870400) before we
+// have to accumulate into 64-bit elements. Blocks of size 32x64, 64x32,
+// 64x64, 64x128, 128x64 and 128x128 are therefore processed in a different
+// helper function.
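+// (As a sanity check on that bound: 4095 * 4095 = 16769025, and
+// 257 * 16769025 = 4309639425 would already exceed 2^32 - 1 = 4294967295.)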
+static INLINE void highbd_obmc_variance_xlarge_neon(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int width, int h, int h_limit, uint64_t *sse,
+ int64_t *sum) {
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+ // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit
+ // accumulator overflows. After hitting this limit we accumulate into 64-bit
+ // elements.
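+  // With a block width of 'w', each 32-bit accumulator lane gathers w / 8
+  // squared values per row, so the 256-value bound above works out to
+  // h_limit = 2048 / w: 16 rows at w == 128, 32 at w == 64 and 64 at w == 32,
+  // matching the wrappers below.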
+ int h_tmp = h > h_limit ? h_limit : h;
+
+ do {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+ int j = 0;
+
+ do {
+ int i = 0;
+
+ do {
+ uint16x8_t pre0 = vld1q_u16(pre_ptr + i);
+ highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32[0],
+ &sum_s32);
+
+ uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8);
+ highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32[1],
+ &sum_s32);
+
+ i += 16;
+ wsrc += 16;
+ mask += 16;
+ } while (i < width);
+
+ pre_ptr += pre_stride;
+ j++;
+ } while (j < h_tmp);
+
+ sse_u64 = vpadalq_u32(sse_u64, sse_u32[0]);
+ sse_u64 = vpadalq_u32(sse_u64, sse_u32[1]);
+ h -= h_tmp;
+ } while (h != 0);
+
+ *sse = horizontal_add_u64x2(sse_u64);
+ *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_128xh(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 128, h, 16, sse,
+ sum);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_64xh(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 64, h, 32, sse,
+ sum);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_32xh(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 32, h, 64, sse,
+ sum);
+}
+
+static INLINE void highbd_obmc_variance_large_neon(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int width, int h, uint64_t *sse, int64_t *sum) {
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+
+ do {
+ int i = 0;
+ do {
+ uint16x8_t pre0 = vld1q_u16(pre_ptr + i);
+ highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32, &sum_s32);
+
+ uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8);
+ highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32,
+ &sum_s32);
+
+ i += 16;
+ wsrc += 16;
+ mask += 16;
+ } while (i < width);
+
+ pre_ptr += pre_stride;
+ } while (--h != 0);
+
+ *sse = horizontal_long_add_u32x4(sse_u32);
+ *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_neon_128xh(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 128, h, sse,
+ sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_64xh(const uint8_t *pre,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 64, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_32xh(const uint8_t *pre,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 32, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_16xh(const uint8_t *pre,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 16, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_8xh(const uint8_t *pre8,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+
+ do {
+ uint16x8_t pre_u16 = vld1q_u16(pre);
+
+ highbd_obmc_variance_8x1_s16_neon(pre_u16, wsrc, mask, &sse_u32, &sum_s32);
+
+ pre += pre_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--h != 0);
+
+ *sse = horizontal_long_add_u32x4(sse_u32);
+ *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_neon_4xh(const uint8_t *pre8,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ assert(h % 2 == 0);
+ uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+
+ do {
+ uint16x8_t pre_u16 = load_unaligned_u16_4x2(pre, pre_stride);
+
+ highbd_obmc_variance_8x1_s16_neon(pre_u16, wsrc, mask, &sse_u32, &sum_s32);
+
+ pre += 2 * pre_stride;
+ wsrc += 8;
+ mask += 8;
+ h -= 2;
+ } while (h != 0);
+
+ *sse = horizontal_long_add_u32x4(sse_u32);
+ *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_8_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+ int *sum, unsigned int *sse) {
+ *sum = (int)sum64;
+ *sse = (unsigned int)sse64;
+}
+
+static INLINE void highbd_10_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+ int *sum, unsigned int *sse) {
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+static INLINE void highbd_12_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+ int *sum, unsigned int *sse) {
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
+
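+// The wrappers below compute variance as sse - sum^2 / (w * h). The 10-bit
+// and 12-bit cast helpers above first scale sum and sse back to an 8-bit
+// equivalent range (10-bit / 12-bit pixels are 4x / 16x larger, so sum is
+// scaled down by 2 / 4 bits and sse by 4 / 8 bits, with rounding).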
+#define HIGHBD_OBMC_VARIANCE_WXH_NEON(w, h, bitdepth) \
+ unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t sum64; \
+ uint64_t sse64; \
+ highbd_obmc_variance_neon_##w##xh(pre, pre_stride, wsrc, mask, h, &sse64, \
+ &sum64); \
+ highbd_##bitdepth##_obmc_variance_cast(sum64, sse64, &sum, sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h)); \
+ }
+
+#define HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(w, h, bitdepth) \
+ unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t sum64; \
+ uint64_t sse64; \
+ highbd_obmc_variance_xlarge_neon_##w##xh(pre, pre_stride, wsrc, mask, h, \
+ &sse64, &sum64); \
+ highbd_##bitdepth##_obmc_variance_cast(sum64, sse64, &sum, sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h)); \
+ }
+
+// 8-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 8)
+
+// 10-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 10)
+
+// 12-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(32, 64, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 64, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 128, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 64, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 128, 12)
diff --git a/third_party/aom/aom_dsp/arm/highbd_quantize_neon.c b/third_party/aom/aom_dsp/arm/highbd_quantize_neon.c
new file mode 100644
index 0000000000..6149c9f13e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_quantize_neon.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/quantize.h"
+
+static INLINE uint32_t sum_abs_coeff(const uint32x4_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddvq_u32(a);
+#else
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
+ return (uint32_t)vget_lane_u64(c, 0);
+#endif
+}
+
+static INLINE uint16x4_t
+quantize_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
+ int32x4_t v_dequant_s32, int32x4_t v_round_s32, int32x4_t v_zbin_s32,
+ int32x4_t v_quant_shift_s32, int log_scale) {
+ const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+ const int32x4_t v_coeff_sign =
+ vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+ const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+  // if (abs_coeff < zbins[rc != 0]) the coefficient is zeroed; build the
+  // inverse (keep) mask here.
+ const uint32x4_t v_zbin_mask = vcgeq_s32(v_abs_coeff, v_zbin_s32);
+ const int32x4_t v_log_scale = vdupq_n_s32(log_scale);
+ // const int64_t tmp = (int64_t)abs_coeff + log_scaled_round;
+ const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32);
+ // const int32_t tmpw32 = tmp * wt;
+ const int32x4_t v_tmpw32 = vmulq_s32(v_tmp, vdupq_n_s32((1 << AOM_QM_BITS)));
+ // const int32_t tmp2 = (int32_t)((tmpw32 * quant64) >> 16);
+ const int32x4_t v_tmp2 = vqdmulhq_s32(v_tmpw32, v_quant_s32);
+ // const int32_t tmp3 =
+ // ((((tmp2 + tmpw32)<< log_scale) * (int64_t)(quant_shift << 15)) >> 32);
+ const int32x4_t v_tmp3 = vqdmulhq_s32(
+ vshlq_s32(vaddq_s32(v_tmp2, v_tmpw32), v_log_scale), v_quant_shift_s32);
+ // const int abs_qcoeff = vmask ? (int)tmp3 >> AOM_QM_BITS : 0;
+ const int32x4_t v_abs_qcoeff = vandq_s32(vreinterpretq_s32_u32(v_zbin_mask),
+ vshrq_n_s32(v_tmp3, AOM_QM_BITS));
+ // const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant_iwt) >> log_scale;
+  // vshlq_s32 will shift right if the shift value is negative.
+ const int32x4_t v_abs_dqcoeff =
+ vshlq_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), vnegq_s32(v_log_scale));
+ // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_qcoeff =
+ vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+ // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_dqcoeff =
+ vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+ vst1q_s32(qcoeff_ptr, v_qcoeff);
+ vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+ // Used to find eob.
+ const uint32x4_t nz_qcoeff_mask = vcgtq_s32(v_abs_qcoeff, vdupq_n_s32(0));
+ return vmovn_u32(nz_qcoeff_mask);
+}
+
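+// Track max(iscan + 1) over all non-zero quantized coefficients (zero lanes
+// contribute 0), so the running maximum can be written to *eob_ptr directly
+// as a coefficient count.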
+static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
+ int16x8_t v_eobmax,
+ uint16x8_t v_mask) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
+ const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
+ return vmaxq_s16(v_eobmax, v_nz_iscan);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void get_min_max_lane_eob(const int16_t *iscan,
+ int16x8_t *v_eobmin,
+ int16x8_t *v_eobmax, uint16x8_t v_mask,
+ intptr_t n_coeffs) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ const int16x8_t v_nz_iscan_max = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1));
+#if SKIP_EOB_FACTOR_ADJUST
+ const int16x8_t v_nz_iscan_min =
+ vbslq_s16(v_mask, v_iscan, vdupq_n_s16((int16_t)n_coeffs));
+ *v_eobmin = vminq_s16(*v_eobmin, v_nz_iscan_min);
+#else
+ (void)v_eobmin;
+#endif
+ *v_eobmax = vmaxq_s16(*v_eobmax, v_nz_iscan_max);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#if AOM_ARCH_AARCH64
+ return (uint16_t)vmaxvq_s16(v_eobmax);
+#else
+ const int16x4_t v_eobmax_3210 =
+ vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+ return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+#endif
+}
+
+#if SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY
+static INLINE uint16_t get_min_eob(int16x8_t v_eobmin) {
+#if AOM_ARCH_AARCH64
+ return (uint16_t)vminvq_s16(v_eobmin);
+#else
+ const int16x4_t v_eobmin_3210 =
+ vmin_s16(vget_low_s16(v_eobmin), vget_high_s16(v_eobmin));
+ const int64x1_t v_eobmin_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmin_3210), 32);
+ const int16x4_t v_eobmin_tmp =
+ vmin_s16(v_eobmin_3210, vreinterpret_s16_s64(v_eobmin_xx32));
+ const int64x1_t v_eobmin_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmin_tmp), 16);
+ const int16x4_t v_eobmin_final =
+ vmin_s16(v_eobmin_tmp, vreinterpret_s16_s64(v_eobmin_xxx3));
+ return (uint16_t)vget_lane_s16(v_eobmin_final, 0);
+#endif
+}
+#endif // SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY
+
+static void highbd_quantize_b_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const int log_scale) {
+ (void)scan;
+ const int16x4_t v_quant = vld1_s16(quant_ptr);
+ const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+ const int16x4_t v_zero = vdup_n_s16(0);
+ const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero);
+ const int16x4_t v_round_no_scale = vld1_s16(round_ptr);
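+  // vqrdmulh_n_s16 by (1 << (15 - log_scale)) computes
+  // ROUND_POWER_OF_TWO(round_ptr[i], log_scale); the zbin values below are
+  // scaled the same way. log_scale is 1 for 32x32 and 2 for 64x64 blocks.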
+ const int16x4_t v_round_log_scale =
+ vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
+ const int16x4_t v_round =
+ vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale);
+ const int16x4_t v_quant_shift = vld1_s16(quant_shift_ptr);
+ const int16x4_t v_zbin_no_scale = vld1_s16(zbin_ptr);
+ const int16x4_t v_zbin_log_scale =
+ vqrdmulh_n_s16(v_zbin_no_scale, (int16_t)(1 << (15 - log_scale)));
+ const int16x4_t v_zbin =
+ vbsl_s16(v_round_select, v_zbin_log_scale, v_zbin_no_scale);
+ int32x4_t v_round_s32 = vmovl_s16(v_round);
+ int32x4_t v_quant_s32 = vshlq_n_s32(vmovl_s16(v_quant), 15);
+ int32x4_t v_dequant_s32 = vmovl_s16(v_dequant);
+ int32x4_t v_quant_shift_s32 = vshlq_n_s32(vmovl_s16(v_quant_shift), 15);
+ int32x4_t v_zbin_s32 = vmovl_s16(v_zbin);
+ uint16x4_t v_mask_lo, v_mask_hi;
+ int16x8_t v_eobmax = vdupq_n_s16(-1);
+
+ intptr_t non_zero_count = n_coeffs;
+
+ assert(n_coeffs > 8);
+ // Pre-scan pass
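+  // Walk the coefficient buffer backwards in groups of 8, trimming trailing
+  // groups whose values all lie below the AC zbin; those positions quantize
+  // to zero and are simply cleared by the memsets below.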
+ const int32x4_t v_zbin_s32x = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1);
+ intptr_t i = n_coeffs;
+ do {
+ const int32x4_t v_coeff_a = vld1q_s32(coeff_ptr + i - 4);
+ const int32x4_t v_coeff_b = vld1q_s32(coeff_ptr + i - 8);
+ const int32x4_t v_abs_coeff_a = vabsq_s32(v_coeff_a);
+ const int32x4_t v_abs_coeff_b = vabsq_s32(v_coeff_b);
+ const uint32x4_t v_mask_a = vcgeq_s32(v_abs_coeff_a, v_zbin_s32x);
+ const uint32x4_t v_mask_b = vcgeq_s32(v_abs_coeff_b, v_zbin_s32x);
+ // If the coefficient is in the base ZBIN range, then discard.
+ if (sum_abs_coeff(v_mask_a) + sum_abs_coeff(v_mask_b) == 0) {
+ non_zero_count -= 8;
+ } else {
+ break;
+ }
+ i -= 8;
+ } while (i > 0);
+
+ const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count;
+ memset(qcoeff_ptr + non_zero_count, 0,
+ remaining_zcoeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr + non_zero_count, 0,
+ remaining_zcoeffs * sizeof(*dqcoeff_ptr));
+
+ // DC and first 3 AC
+ v_mask_lo =
+ quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, v_dequant_s32,
+ v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale);
+
+  // Overwrite the DC constants with AC constants.
+ v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+ v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+ v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+ v_quant_shift_s32 = vdupq_lane_s32(vget_low_s32(v_quant_shift_s32), 1);
+ v_zbin_s32 = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1);
+
+ // 4 more AC
+ v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32,
+ v_quant_shift_s32, log_scale);
+
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+
+ intptr_t count = non_zero_count - 8;
+ for (; count > 0; count -= 8) {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32,
+ v_dequant_s32, v_round_s32, v_zbin_s32,
+ v_quant_shift_s32, log_scale);
+ v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32,
+ v_quant_shift_s32, log_scale);
+ // Find the max lane eob for 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+ }
+
+ *eob_ptr = get_max_eob(v_eobmax);
+}
+
+void aom_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, 0);
+}
+
+void aom_highbd_quantize_b_32x32_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, 1);
+}
+
+void aom_highbd_quantize_b_64x64_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, 2);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void highbd_quantize_b_adaptive_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const int log_scale) {
+ (void)scan;
+ const int16x4_t v_quant = vld1_s16(quant_ptr);
+ const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+ const int16x4_t v_zero = vdup_n_s16(0);
+ const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero);
+ const int16x4_t v_round_no_scale = vld1_s16(round_ptr);
+ const int16x4_t v_round_log_scale =
+ vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
+ const int16x4_t v_round =
+ vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale);
+ const int16x4_t v_quant_shift = vld1_s16(quant_shift_ptr);
+ const int16x4_t v_zbin_no_scale = vld1_s16(zbin_ptr);
+ const int16x4_t v_zbin_log_scale =
+ vqrdmulh_n_s16(v_zbin_no_scale, (int16_t)(1 << (15 - log_scale)));
+ const int16x4_t v_zbin =
+ vbsl_s16(v_round_select, v_zbin_log_scale, v_zbin_no_scale);
+ int32x4_t v_round_s32 = vmovl_s16(v_round);
+ int32x4_t v_quant_s32 = vshlq_n_s32(vmovl_s16(v_quant), 15);
+ int32x4_t v_dequant_s32 = vmovl_s16(v_dequant);
+ int32x4_t v_quant_shift_s32 = vshlq_n_s32(vmovl_s16(v_quant_shift), 15);
+ int32x4_t v_zbin_s32 = vmovl_s16(v_zbin);
+ uint16x4_t v_mask_lo, v_mask_hi;
+ int16x8_t v_eobmax = vdupq_n_s16(-1);
+ int16x8_t v_eobmin = vdupq_n_s16((int16_t)n_coeffs);
+
+ assert(n_coeffs > 8);
+ // Pre-scan pass
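+  // As in the non-adaptive path, but the threshold is widened by a margin
+  // derived from the AC dequant step (EOB_FACTOR), so slightly larger
+  // trailing coefficients can also be trimmed.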
+ const int32x4_t v_zbin_s32x = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1);
+ const int prescan_add_1 =
+ ROUND_POWER_OF_TWO(dequant_ptr[1] * EOB_FACTOR, 7 + AOM_QM_BITS);
+ const int32x4_t v_zbin_prescan =
+ vaddq_s32(v_zbin_s32x, vdupq_n_s32(prescan_add_1));
+ intptr_t non_zero_count = n_coeffs;
+ intptr_t i = n_coeffs;
+ do {
+ const int32x4_t v_coeff_a = vld1q_s32(coeff_ptr + i - 4);
+ const int32x4_t v_coeff_b = vld1q_s32(coeff_ptr + i - 8);
+ const int32x4_t v_abs_coeff_a = vabsq_s32(v_coeff_a);
+ const int32x4_t v_abs_coeff_b = vabsq_s32(v_coeff_b);
+ const uint32x4_t v_mask_a = vcgeq_s32(v_abs_coeff_a, v_zbin_prescan);
+ const uint32x4_t v_mask_b = vcgeq_s32(v_abs_coeff_b, v_zbin_prescan);
+ // If the coefficient is in the base ZBIN range, then discard.
+ if (sum_abs_coeff(v_mask_a) + sum_abs_coeff(v_mask_b) == 0) {
+ non_zero_count -= 8;
+ } else {
+ break;
+ }
+ i -= 8;
+ } while (i > 0);
+
+ const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count;
+ memset(qcoeff_ptr + non_zero_count, 0,
+ remaining_zcoeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr + non_zero_count, 0,
+ remaining_zcoeffs * sizeof(*dqcoeff_ptr));
+
+ // DC and first 3 AC
+ v_mask_lo =
+ quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, v_dequant_s32,
+ v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale);
+
+  // Overwrite the DC constants with AC constants.
+ v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+ v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+ v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+ v_quant_shift_s32 = vdupq_lane_s32(vget_low_s32(v_quant_shift_s32), 1);
+ v_zbin_s32 = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1);
+
+ // 4 more AC
+ v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32,
+ v_quant_shift_s32, log_scale);
+
+ get_min_max_lane_eob(iscan, &v_eobmin, &v_eobmax,
+ vcombine_u16(v_mask_lo, v_mask_hi), n_coeffs);
+
+ intptr_t count = non_zero_count - 8;
+ for (; count > 0; count -= 8) {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32,
+ v_dequant_s32, v_round_s32, v_zbin_s32,
+ v_quant_shift_s32, log_scale);
+ v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32,
+ v_quant_shift_s32, log_scale);
+
+ get_min_max_lane_eob(iscan, &v_eobmin, &v_eobmax,
+ vcombine_u16(v_mask_lo, v_mask_hi), n_coeffs);
+ }
+
+ int eob = get_max_eob(v_eobmax);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ const int first = get_min_eob(v_eobmin);
+ if (eob >= 0 && first == eob) {
+ const int rc = scan[eob];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ eob = -1;
+ }
+ }
+ }
+#endif // SKIP_EOB_FACTOR_ADJUST
+ *eob_ptr = eob + 1;
+}
+
+void aom_highbd_quantize_b_adaptive_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_adaptive_neon(
+ coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0);
+}
+
+void aom_highbd_quantize_b_32x32_adaptive_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_adaptive_neon(
+ coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1);
+}
+
+void aom_highbd_quantize_b_64x64_adaptive_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_adaptive_neon(
+ coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 2);
+}
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_sad_neon.c b/third_party/aom/aom_dsp/arm/highbd_sad_neon.c
new file mode 100644
index 0000000000..d51f639de6
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_sad_neon.c
@@ -0,0 +1,509 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE uint32_t highbd_sad4xh_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr);
+ uint16x4_t r = vld1_u16(ref16_ptr);
+ sum = vabal_u16(sum, s, r);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
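+// The "small" 8-wide variant accumulates absolute differences in 16-bit
+// lanes; with 12-bit input each lane gains at most 4095 per row
+// (4095 * 16 = 65520 < 65535), which is why it is only used where at most
+// 16 rows are accumulated.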
+static INLINE uint32_t highbd_sad8xh_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr);
+ uint16x8_t r = vld1q_u16(ref16_ptr);
+ sum = vabaq_u16(sum, s, r);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr);
+ uint16x8_t r = vld1q_u16(ref16_ptr);
+ uint16x8_t sum_u16 = vabdq_u16(s, r);
+ sum_u32 = vpadalq_u16(sum_u32, sum_u16);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE uint32_t highbd_sad16xh_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr);
+ uint16x8_t r0 = vld1q_u16(ref16_ptr);
+ uint16x8_t diff0 = vabdq_u16(s0, r0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + 8);
+ uint16x8_t r1 = vld1q_u16(ref16_ptr + 8);
+ uint16x8_t diff1 = vabdq_u16(s1, r1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE uint32_t highbd_sadwxh_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr + j);
+ uint16x8_t r0 = vld1q_u16(ref16_ptr + j);
+ uint16x8_t diff0 = vabdq_u16(s0, r0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + j + 8);
+ uint16x8_t r1 = vld1q_u16(ref16_ptr + j + 8);
+ uint16x8_t diff1 = vabdq_u16(s1, r1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ uint16x8_t s2 = vld1q_u16(src16_ptr + j + 16);
+ uint16x8_t r2 = vld1q_u16(ref16_ptr + j + 16);
+ uint16x8_t diff2 = vabdq_u16(s2, r2);
+ sum[2] = vpadalq_u16(sum[2], diff2);
+
+ uint16x8_t s3 = vld1q_u16(src16_ptr + j + 24);
+ uint16x8_t r3 = vld1q_u16(ref16_ptr + j + 24);
+ uint16x8_t diff3 = vabdq_u16(s3, r3);
+ sum[3] = vpadalq_u16(sum[3], diff3);
+
+ j += 32;
+ } while (j < w);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int highbd_sad128xh_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128,
+ h);
+}
+
+static INLINE unsigned int highbd_sad64xh_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ h);
+}
+
+static INLINE unsigned int highbd_sad32xh_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32,
+ h);
+}
+
+#define HBD_SAD_WXH_SMALL_NEON(w, h) \
+ unsigned int aom_highbd_sad##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad##w##xh_small_neon(src, src_stride, ref, ref_stride, \
+ (h)); \
+ }
+
+#define HBD_SAD_WXH_LARGE_NEON(w, h) \
+ unsigned int aom_highbd_sad##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad##w##xh_large_neon(src, src_stride, ref, ref_stride, \
+ (h)); \
+ }
+
+HBD_SAD_WXH_SMALL_NEON(4, 4)
+HBD_SAD_WXH_SMALL_NEON(4, 8)
+
+HBD_SAD_WXH_SMALL_NEON(8, 4)
+HBD_SAD_WXH_SMALL_NEON(8, 8)
+HBD_SAD_WXH_SMALL_NEON(8, 16)
+
+HBD_SAD_WXH_LARGE_NEON(16, 8)
+HBD_SAD_WXH_LARGE_NEON(16, 16)
+HBD_SAD_WXH_LARGE_NEON(16, 32)
+
+HBD_SAD_WXH_LARGE_NEON(32, 16)
+HBD_SAD_WXH_LARGE_NEON(32, 32)
+HBD_SAD_WXH_LARGE_NEON(32, 64)
+
+HBD_SAD_WXH_LARGE_NEON(64, 32)
+HBD_SAD_WXH_LARGE_NEON(64, 64)
+HBD_SAD_WXH_LARGE_NEON(64, 128)
+
+HBD_SAD_WXH_LARGE_NEON(128, 64)
+HBD_SAD_WXH_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_SMALL_NEON(4, 16)
+
+HBD_SAD_WXH_LARGE_NEON(8, 32)
+
+HBD_SAD_WXH_LARGE_NEON(16, 4)
+HBD_SAD_WXH_LARGE_NEON(16, 64)
+
+HBD_SAD_WXH_LARGE_NEON(32, 8)
+
+HBD_SAD_WXH_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
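+// The sad_skip variants estimate SAD from every other row (doubling the row
+// stride and halving h) and scale the result by 2 to approximate the full
+// block SAD.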
+#define HBD_SAD_SKIP_WXH_SMALL_NEON(w, h) \
+ unsigned int aom_highbd_sad_skip_##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad##w##xh_small_neon(src, 2 * src_stride, ref, \
+ 2 * ref_stride, (h) / 2); \
+ }
+
+#define HBD_SAD_SKIP_WXH_LARGE_NEON(w, h) \
+ unsigned int aom_highbd_sad_skip_##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad##w##xh_large_neon(src, 2 * src_stride, ref, \
+ 2 * ref_stride, (h) / 2); \
+ }
+
+HBD_SAD_SKIP_WXH_SMALL_NEON(4, 4)
+HBD_SAD_SKIP_WXH_SMALL_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_SMALL_NEON(8, 4)
+HBD_SAD_SKIP_WXH_SMALL_NEON(8, 8)
+HBD_SAD_SKIP_WXH_SMALL_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(16, 8)
+HBD_SAD_SKIP_WXH_LARGE_NEON(16, 16)
+HBD_SAD_SKIP_WXH_LARGE_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(32, 16)
+HBD_SAD_SKIP_WXH_LARGE_NEON(32, 32)
+HBD_SAD_SKIP_WXH_LARGE_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(64, 32)
+HBD_SAD_SKIP_WXH_LARGE_NEON(64, 64)
+HBD_SAD_SKIP_WXH_LARGE_NEON(64, 128)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(128, 64)
+HBD_SAD_SKIP_WXH_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_SKIP_WXH_SMALL_NEON(4, 16)
+
+HBD_SAD_SKIP_WXH_SMALL_NEON(8, 32)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(16, 4)
+HBD_SAD_SKIP_WXH_LARGE_NEON(16, 64)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(32, 8)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
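+// The _avg variants compute the SAD of the source against the rounding
+// average (vrhadd) of the reference block and a second predictor, as used
+// when evaluating averaged compound predictions.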
+static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr);
+ uint16x4_t r = vld1_u16(ref16_ptr);
+ uint16x4_t p = vld1_u16(pred16_ptr);
+
+ uint16x4_t avg = vrhadd_u16(r, p);
+ sum = vabal_u16(sum, s, avg);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 4;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr);
+ uint16x8_t r = vld1q_u16(ref16_ptr);
+ uint16x8_t p = vld1q_u16(pred16_ptr);
+
+ uint16x8_t avg = vrhaddq_u16(r, p);
+ uint16x8_t diff = vabdq_u16(s, avg);
+ sum = vpadalq_u16(sum, diff);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 8;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ uint16x8_t s0, s1, r0, r1, p0, p1;
+ uint16x8_t avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u16(src16_ptr);
+ r0 = vld1q_u16(ref16_ptr);
+ p0 = vld1q_u16(pred16_ptr);
+ avg0 = vrhaddq_u16(r0, p0);
+ diff0 = vabdq_u16(s0, avg0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ s1 = vld1q_u16(src16_ptr + 8);
+ r1 = vld1q_u16(ref16_ptr + 8);
+ p1 = vld1q_u16(pred16_ptr + 8);
+ avg1 = vrhaddq_u16(r1, p1);
+ diff1 = vabdq_u16(s1, avg1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 16;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+ uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u16(src16_ptr + j);
+ r0 = vld1q_u16(ref16_ptr + j);
+ p0 = vld1q_u16(pred16_ptr + j);
+ avg0 = vrhaddq_u16(r0, p0);
+ diff0 = vabdq_u16(s0, avg0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ s1 = vld1q_u16(src16_ptr + j + 8);
+ r1 = vld1q_u16(ref16_ptr + j + 8);
+ p1 = vld1q_u16(pred16_ptr + j + 8);
+ avg1 = vrhaddq_u16(r1, p1);
+ diff1 = vabdq_u16(s1, avg1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ s2 = vld1q_u16(src16_ptr + j + 16);
+ r2 = vld1q_u16(ref16_ptr + j + 16);
+ p2 = vld1q_u16(pred16_ptr + j + 16);
+ avg2 = vrhaddq_u16(r2, p2);
+ diff2 = vabdq_u16(s2, avg2);
+ sum[2] = vpadalq_u16(sum[2], diff2);
+
+ s3 = vld1q_u16(src16_ptr + j + 24);
+ r3 = vld1q_u16(ref16_ptr + j + 24);
+ p3 = vld1q_u16(pred16_ptr + j + 24);
+ avg3 = vrhaddq_u16(r3, p3);
+ diff3 = vabdq_u16(s3, avg3);
+ sum[3] = vpadalq_u16(sum[3], diff3);
+
+ j += 32;
+ } while (j < w);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += w;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int highbd_sad128xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128,
+ h, second_pred);
+}
+
+static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
+ second_pred);
+}
+
+static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+ second_pred);
+}
+
+#define HBD_SAD_WXH_AVG_NEON(w, h) \
+ uint32_t aom_highbd_sad##w##x##h##_avg_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+ second_pred); \
+ }
+
+HBD_SAD_WXH_AVG_NEON(4, 4)
+HBD_SAD_WXH_AVG_NEON(4, 8)
+
+HBD_SAD_WXH_AVG_NEON(8, 4)
+HBD_SAD_WXH_AVG_NEON(8, 8)
+HBD_SAD_WXH_AVG_NEON(8, 16)
+
+HBD_SAD_WXH_AVG_NEON(16, 8)
+HBD_SAD_WXH_AVG_NEON(16, 16)
+HBD_SAD_WXH_AVG_NEON(16, 32)
+
+HBD_SAD_WXH_AVG_NEON(32, 16)
+HBD_SAD_WXH_AVG_NEON(32, 32)
+HBD_SAD_WXH_AVG_NEON(32, 64)
+
+HBD_SAD_WXH_AVG_NEON(64, 32)
+HBD_SAD_WXH_AVG_NEON(64, 64)
+HBD_SAD_WXH_AVG_NEON(64, 128)
+
+HBD_SAD_WXH_AVG_NEON(128, 64)
+HBD_SAD_WXH_AVG_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_AVG_NEON(4, 16)
+
+HBD_SAD_WXH_AVG_NEON(8, 32)
+
+HBD_SAD_WXH_AVG_NEON(16, 4)
+HBD_SAD_WXH_AVG_NEON(16, 64)
+
+HBD_SAD_WXH_AVG_NEON(32, 8)
+
+HBD_SAD_WXH_AVG_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c b/third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c
new file mode 100644
index 0000000000..85ca6732a8
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c
@@ -0,0 +1,617 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
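+// The x4d/x3d helpers below compute the SAD of one source block against four
+// (or three) candidate reference blocks in a single pass, sharing the source
+// loads across candidates.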
+static INLINE void highbd_sad4xhx4d_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
+ uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
+ uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
+ uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
+ uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride);
+
+ sum[0] = vabal_u16(sum[0], s, r0);
+ sum[1] = vabal_u16(sum[1], s, r1);
+ sum[2] = vabal_u16(sum[2], s, r2);
+ sum[3] = vabal_u16(sum[3], s, r3);
+
+ } while (++i < h);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void highbd_sad8xhx4d_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32[4];
+
+ int i = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+
+ sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
+ sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
+ sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
+ sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride));
+
+ } while (++i < h);
+
+ sum_u32[0] = vpaddlq_u16(sum[0]);
+ sum_u32[1] = vpaddlq_u16(sum[1]);
+ sum_u32[2] = vpaddlq_u16(sum[2]);
+ sum_u32[3] = vpaddlq_u16(sum[3]);
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
+}
+
+static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
+ uint32x4_t *const sad_sum) {
+ uint16x8_t abs_diff = vabdq_u16(src, ref);
+ *sad_sum = vpadalq_u16(*sad_sum, abs_diff);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void highbd_sad8xhx4d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+ sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]);
+ sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]);
+ sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]);
+ sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]);
+
+ } while (++i < h);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE void highbd_sad16xhx4d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int i = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]);
+
+ } while (++i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void highbd_sadwxhx4d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int w, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]);
+
+ uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
+ sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
+ &sum_lo[0]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
+ &sum_lo[1]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
+ &sum_lo[2]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16),
+ &sum_lo[3]);
+
+ uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
+ sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
+ &sum_hi[0]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
+ &sum_hi[1]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
+ &sum_hi[2]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24),
+ &sum_hi[3]);
+
+ j += 32;
+ } while (j < w);
+
+ } while (++i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void highbd_sad128xhx4d_large_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4], int h) {
+ highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+ 128, h);
+}
+
+static INLINE void highbd_sad64xhx4d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64,
+ h);
+}
+
+static INLINE void highbd_sad32xhx4d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32,
+ h);
+}
+
+#define HBD_SAD_WXH_4D_SMALL_NEON(w, h) \
+ void aom_highbd_sad##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx4d_small_neon(src, src_stride, ref_array, ref_stride, \
+ sad_array, (h)); \
+ }
+
+#define HBD_SAD_WXH_4D_LARGE_NEON(w, h) \
+ void aom_highbd_sad##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx4d_large_neon(src, src_stride, ref_array, ref_stride, \
+ sad_array, (h)); \
+ }
+
+HBD_SAD_WXH_4D_SMALL_NEON(4, 4)
+HBD_SAD_WXH_4D_SMALL_NEON(4, 8)
+
+HBD_SAD_WXH_4D_SMALL_NEON(8, 4)
+HBD_SAD_WXH_4D_SMALL_NEON(8, 8)
+HBD_SAD_WXH_4D_SMALL_NEON(8, 16)
+
+HBD_SAD_WXH_4D_LARGE_NEON(16, 8)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 16)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 32)
+
+HBD_SAD_WXH_4D_LARGE_NEON(32, 16)
+HBD_SAD_WXH_4D_LARGE_NEON(32, 32)
+HBD_SAD_WXH_4D_LARGE_NEON(32, 64)
+
+HBD_SAD_WXH_4D_LARGE_NEON(64, 32)
+HBD_SAD_WXH_4D_LARGE_NEON(64, 64)
+HBD_SAD_WXH_4D_LARGE_NEON(64, 128)
+
+HBD_SAD_WXH_4D_LARGE_NEON(128, 64)
+HBD_SAD_WXH_4D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_4D_SMALL_NEON(4, 16)
+
+HBD_SAD_WXH_4D_LARGE_NEON(8, 32)
+
+HBD_SAD_WXH_4D_LARGE_NEON(16, 4)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 64)
+
+HBD_SAD_WXH_4D_LARGE_NEON(32, 8)
+
+HBD_SAD_WXH_4D_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#define HBD_SAD_SKIP_WXH_4D_SMALL_NEON(w, h) \
+ void aom_highbd_sad_skip_##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx4d_small_neon(src, 2 * src_stride, ref_array, \
+ 2 * ref_stride, sad_array, ((h) >> 1)); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+#define HBD_SAD_SKIP_WXH_4D_LARGE_NEON(w, h) \
+ void aom_highbd_sad_skip_##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx4d_large_neon(src, 2 * src_stride, ref_array, \
+ 2 * ref_stride, sad_array, ((h) >> 1)); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 4)
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 4)
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 8)
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 8)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 16)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 16)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 32)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 32)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 64)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 128)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 64)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 16)
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 32)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 4)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 64)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 8)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE void highbd_sad4xhx3d_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
+ uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
+ uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
+ uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
+
+ sum[0] = vabal_u16(sum[0], s, r0);
+ sum[1] = vabal_u16(sum[1], s, r1);
+ sum[2] = vabal_u16(sum[2], s, r2);
+
+ } while (++i < h);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+static INLINE void highbd_sad8xhx3d_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
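+  // With h <= 16 for the block sizes routed to this "small" variant and a
+  // per-pixel absolute difference of at most 4095 (12-bit input), each 16-bit
+  // lane accumulates at most 16 * 4095 = 65520, so vabaq_u16 cannot overflow
+  // here.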
+ uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+
+ sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
+ sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
+ sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
+
+ } while (++i < h);
+
+ res[0] = horizontal_add_u32x4(vpaddlq_u16(sum[0]));
+ res[1] = horizontal_add_u32x4(vpaddlq_u16(sum[1]));
+ res[2] = horizontal_add_u32x4(vpaddlq_u16(sum[2]));
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void highbd_sad8xhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+ uint16x8_t r0 = vld1q_u16(ref16_ptr0 + i * ref_stride);
+ uint16x8_t r1 = vld1q_u16(ref16_ptr1 + i * ref_stride);
+ uint16x8_t r2 = vld1q_u16(ref16_ptr2 + i * ref_stride);
+
+ sad8_neon(s, r0, &sum[0]);
+ sad8_neon(s, r1, &sum[1]);
+ sad8_neon(s, r2, &sum[2]);
+
+ } while (++i < h);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE void highbd_sad16xhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
+
+ } while (++i < h);
+
+ res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0]));
+ res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1]));
+ res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2]));
+}
+
+static INLINE void highbd_sadwxhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int w, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint32x4_t sum[3];
+
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
+
+ uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
+ sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
+ &sum_lo[0]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
+ &sum_lo[1]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
+ &sum_lo[2]);
+
+ uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
+ sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
+ &sum_hi[0]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
+ &sum_hi[1]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
+ &sum_hi[2]);
+
+ j += 32;
+ } while (j < w);
+
+ } while (++i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+static INLINE void highbd_sad128xhx3d_large_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4], int h) {
+ highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+ 128, h);
+}
+
+static INLINE void highbd_sad64xhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64,
+ h);
+}
+
+static INLINE void highbd_sad32xhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32,
+ h);
+}
+
+#define HBD_SAD_WXH_3D_SMALL_NEON(w, h) \
+ void aom_highbd_sad##w##x##h##x3d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx3d_small_neon(src, src_stride, ref_array, ref_stride, \
+ sad_array, (h)); \
+ }
+
+#define HBD_SAD_WXH_3D_LARGE_NEON(w, h) \
+ void aom_highbd_sad##w##x##h##x3d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx3d_large_neon(src, src_stride, ref_array, ref_stride, \
+ sad_array, (h)); \
+ }
+
+HBD_SAD_WXH_3D_SMALL_NEON(4, 4)
+HBD_SAD_WXH_3D_SMALL_NEON(4, 8)
+
+HBD_SAD_WXH_3D_SMALL_NEON(8, 4)
+HBD_SAD_WXH_3D_SMALL_NEON(8, 8)
+HBD_SAD_WXH_3D_SMALL_NEON(8, 16)
+
+HBD_SAD_WXH_3D_LARGE_NEON(16, 8)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 16)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 32)
+
+HBD_SAD_WXH_3D_LARGE_NEON(32, 16)
+HBD_SAD_WXH_3D_LARGE_NEON(32, 32)
+HBD_SAD_WXH_3D_LARGE_NEON(32, 64)
+
+HBD_SAD_WXH_3D_LARGE_NEON(64, 32)
+HBD_SAD_WXH_3D_LARGE_NEON(64, 64)
+HBD_SAD_WXH_3D_LARGE_NEON(64, 128)
+
+HBD_SAD_WXH_3D_LARGE_NEON(128, 64)
+HBD_SAD_WXH_3D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_3D_SMALL_NEON(4, 16)
+
+HBD_SAD_WXH_3D_LARGE_NEON(8, 32)
+
+HBD_SAD_WXH_3D_LARGE_NEON(16, 4)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 64)
+
+HBD_SAD_WXH_3D_LARGE_NEON(32, 8)
+
+HBD_SAD_WXH_3D_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_sse_neon.c b/third_party/aom/aom_dsp/arm/highbd_sse_neon.c
new file mode 100644
index 0000000000..184e9f9bef
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_sse_neon.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/arm/sum_neon.h"
+
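+// The _init variant seeds the accumulators with vmull for the first row, so
+// they do not need to be zeroed up front; subsequent rows accumulate with the
+// vmlal form in highbd_sse_8x1_neon below.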
+static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src,
+ const uint16_t *ref,
+ uint32x4_t *sse_acc0,
+ uint32x4_t *sse_acc1) {
+ uint16x8_t s = vld1q_u16(src);
+ uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+ uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+ *sse_acc0 = vmull_u16(abs_diff_lo, abs_diff_lo);
+ *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi);
+}
+
+static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref,
+ uint32x4_t *sse_acc0,
+ uint32x4_t *sse_acc1) {
+ uint16x8_t s = vld1q_u16(src);
+ uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+ uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+ *sse_acc0 = vmlal_u16(*sse_acc0, abs_diff_lo, abs_diff_lo);
+ *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi);
+}
+
+static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[16];
+ highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_init_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]);
+ highbd_sse_8x1_init_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]);
+ highbd_sse_8x1_init_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]);
+ highbd_sse_8x1_init_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]);
+ highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]);
+ highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]);
+ highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]);
+ highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]);
+ highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]);
+ highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]);
+ highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]);
+ highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]);
+ highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]);
+ highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]);
+ highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x16(sse);
+}
+
+static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[8];
+ highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x8(sse);
+}
+
+static INLINE int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[8];
+ highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x8(sse);
+}
+
+static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[4];
+ highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x4(sse);
+}
+
+static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2];
+ highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x2(sse);
+}
+
+static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ // Peel the first loop iteration.
+ uint16x4_t s = vld1_u16(src);
+ uint16x4_t r = vld1_u16(ref);
+
+ uint16x4_t abs_diff = vabd_u16(s, r);
+ uint32x4_t sse = vmull_u16(abs_diff, abs_diff);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ s = vld1_u16(src);
+ r = vld1_u16(ref);
+
+ abs_diff = vabd_u16(s, r);
+ sse = vmlal_u16(sse, abs_diff, abs_diff);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4(sse);
+}
+
+static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int width, int height) {
+ // { 0, 1, 2, 3, 4, 5, 6, 7 }
+ uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100));
+ uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7));
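+  // remainder_mask has all-ones in lanes 0..(width % 8)-1 and zeros in the
+  // remaining lanes, e.g. for width == 12 the final iteration of the inner
+  // loop keeps only the first four lanes.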
+ uint64_t sse = 0;
+
+ do {
+ int w = width;
+ int offset = 0;
+
+ do {
+ uint16x8_t s = vld1q_u16(src + offset);
+ uint16x8_t r = vld1q_u16(ref + offset);
+
+ if (w < 8) {
+ // Mask out-of-range elements.
+ s = vandq_u16(s, remainder_mask);
+ r = vandq_u16(r, remainder_mask);
+ }
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+ uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+ uint32x4_t sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo);
+ sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi);
+
+ sse += horizontal_long_add_u32x4(sse_u32);
+
+ offset += 8;
+ w -= 8;
+ } while (w > 0);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--height != 0);
+
+ return sse;
+}
+
+int64_t aom_highbd_sse_neon(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride, int width,
+ int height) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+ switch (width) {
+ case 4:
+ return highbd_sse_4xh_neon(src, src_stride, ref, ref_stride, height);
+ case 8:
+ return highbd_sse_8xh_neon(src, src_stride, ref, ref_stride, height);
+ case 16:
+ return highbd_sse_16xh_neon(src, src_stride, ref, ref_stride, height);
+ case 32:
+ return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height);
+ case 64:
+ return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height);
+ case 128:
+ return highbd_sse_128xh_neon(src, src_stride, ref, ref_stride, height);
+ default:
+ return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width,
+ height);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_sse_sve.c b/third_party/aom/aom_dsp/arm/highbd_sse_sve.c
new file mode 100644
index 0000000000..b267da5cfb
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_sse_sve.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
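+// aom_udotq_u16 (from dot_sve.h) accumulates the dot product of two
+// uint16x8_t vectors into 64-bit lanes, so the squared differences can be
+// summed directly without intermediate 32-bit accumulators.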
+static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref,
+ uint64x2_t *sse) {
+ uint16x8_t s = vld1q_u16(src);
+ uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+
+ *sse = aom_udotq_u16(*sse, abs_diff, abs_diff);
+}
+
+static INLINE int64_t highbd_sse_128xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0),
+ vdupq_n_u64(0) };
+
+ do {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]);
+ highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0]);
+ highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[1]);
+ highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[2]);
+ highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[3]);
+ highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0]);
+ highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[1]);
+ highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[2]);
+ highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[3]);
+ highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[0]);
+ highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[1]);
+ highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[2]);
+ highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[3]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--height != 0);
+
+ sse[0] = vaddq_u64(sse[0], sse[1]);
+ sse[2] = vaddq_u64(sse[2], sse[3]);
+ sse[0] = vaddq_u64(sse[0], sse[2]);
+ return vaddvq_u64(sse[0]);
+}
+
+static INLINE int64_t highbd_sse_64xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0),
+ vdupq_n_u64(0) };
+
+ do {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]);
+ highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0]);
+ highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[1]);
+ highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[2]);
+ highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[3]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--height != 0);
+
+ sse[0] = vaddq_u64(sse[0], sse[1]);
+ sse[2] = vaddq_u64(sse[2], sse[3]);
+ sse[0] = vaddq_u64(sse[0], sse[2]);
+ return vaddvq_u64(sse[0]);
+}
+
+static INLINE int64_t highbd_sse_32xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0),
+ vdupq_n_u64(0) };
+
+ do {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--height != 0);
+
+ sse[0] = vaddq_u64(sse[0], sse[1]);
+ sse[2] = vaddq_u64(sse[2], sse[3]);
+ sse[0] = vaddq_u64(sse[0], sse[2]);
+ return vaddvq_u64(sse[0]);
+}
+
+static INLINE int64_t highbd_sse_16xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint64x2_t sse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ do {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--height != 0);
+
+ return vaddvq_u64(vaddq_u64(sse[0], sse[1]));
+}
+
+static INLINE int64_t highbd_sse_8xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint64x2_t sse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ do {
+ highbd_sse_8x1_neon(src + 0 * src_stride, ref + 0 * ref_stride, &sse[0]);
+ highbd_sse_8x1_neon(src + 1 * src_stride, ref + 1 * ref_stride, &sse[1]);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ height -= 2;
+ } while (height != 0);
+
+ return vaddvq_u64(vaddq_u64(sse[0], sse[1]));
+}
+
+static INLINE int64_t highbd_sse_4xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint64x2_t sse = vdupq_n_u64(0);
+
+ do {
+ uint16x8_t s = load_unaligned_u16_4x2(src, src_stride);
+ uint16x8_t r = load_unaligned_u16_4x2(ref, ref_stride);
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ sse = aom_udotq_u16(sse, abs_diff, abs_diff);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ height -= 2;
+ } while (height != 0);
+
+ return vaddvq_u64(sse);
+}
+
+static INLINE int64_t highbd_sse_wxh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int width, int height) {
+ svuint64_t sse = svdup_n_u64(0);
+ uint64_t step = svcnth();
+
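+  // Predicated loads let a single loop handle any width without a scalar
+  // tail: svwhilelt_b16 activates only the lanes still inside the row, and
+  // svabd_u16_z zeroes the inactive lanes so they contribute nothing to the
+  // dot product.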
+ do {
+ int w = 0;
+ const uint16_t *src_ptr = src;
+ const uint16_t *ref_ptr = ref;
+
+ do {
+ svbool_t pred = svwhilelt_b16_u32(w, width);
+ svuint16_t s = svld1_u16(pred, src_ptr);
+ svuint16_t r = svld1_u16(pred, ref_ptr);
+
+ svuint16_t abs_diff = svabd_u16_z(pred, s, r);
+
+ sse = svdot_u64(sse, abs_diff, abs_diff);
+
+ src_ptr += step;
+ ref_ptr += step;
+ w += step;
+ } while (w < width);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--height != 0);
+
+ return svaddv_u64(svptrue_b64(), sse);
+}
+
+int64_t aom_highbd_sse_sve(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride, int width,
+ int height) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+ switch (width) {
+ case 4: return highbd_sse_4xh_sve(src, src_stride, ref, ref_stride, height);
+ case 8: return highbd_sse_8xh_sve(src, src_stride, ref, ref_stride, height);
+ case 16:
+ return highbd_sse_16xh_sve(src, src_stride, ref, ref_stride, height);
+ case 32:
+ return highbd_sse_32xh_sve(src, src_stride, ref, ref_stride, height);
+ case 64:
+ return highbd_sse_64xh_sve(src, src_stride, ref, ref_stride, height);
+ case 128:
+ return highbd_sse_128xh_sve(src, src_stride, ref, ref_stride, height);
+ default:
+ return highbd_sse_wxh_sve(src, src_stride, ref, ref_stride, width,
+ height);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/highbd_subpel_variance_neon.c
new file mode 100644
index 0000000000..686fa5f226
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_subpel_variance_neon.c
@@ -0,0 +1,1497 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/variance.h"
+
+// The bilinear filters look like this:
+//
+// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }}
+//
+// We can factor out the highest common multiple, such that the sum of both
+// weights will be 8 instead of 128. The benefits of this are two-fold:
+//
+// 1) We can infer the filter values from the filter_offset parameter in the
+// bilinear filter functions below - we don't have to actually load the values
+// from memory:
+// f0 = 8 - filter_offset
+// f1 = filter_offset
+//
+// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on
+// 16-bit data types at all times, rather than widening out to 32-bit and
+// requiring double the number of data processing instructions. (12-bit * 8 =
+// 15-bit.)
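+//
+// As a worked example (illustrative values only): filter_offset == 3 gives
+// f0 = 5 and f1 = 3 (the original { 80, 48 } pair divided by 16). For 12-bit
+// inputs s0 = 4095 and s1 = 0 the rounded blend is
+// (5 * 4095 + 3 * 0 + 4) >> 3 = 2559, and the widest intermediate value,
+// 8 * 4095 = 32760, still fits comfortably in 16 bits.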
+
+// Process a block exactly 4 wide and any height.
+static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+ const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+ uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+
+ uint16x4_t blend = vmul_u16(s0, f0);
+ blend = vmla_u16(blend, s1, f1);
+ blend = vrshr_n_u16(blend, 3);
+
+ vst1_u16(dst_ptr, blend);
+
+ src_ptr += src_stride;
+ dst_ptr += 4;
+ } while (--i != 0);
+}
+
+// Process a block which is a multiple of 8 and any height.
+static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
+ uint16_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_width, int dst_height,
+ int filter_offset) {
+ const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+ const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+
+ uint16x8_t blend = vmulq_u16(s0, f0);
+ blend = vmlaq_u16(blend, s1, f1);
+ blend = vrshrq_n_u16(blend, 3);
+
+ vst1q_u16(dst_ptr + j, blend);
+
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 8, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 16, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 32, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 64, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w128(const uint16_t *src_ptr,
+ uint16_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 128, dst_height, filter_offset);
+}
+
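+// For a subpel offset of 4 the bilinear filter degenerates to { 4, 4 }, i.e.
+// (4 * s0 + 4 * s1 + 4) >> 3 == (s0 + s1 + 1) >> 1, so the specialized paths
+// below use this cheaper rounding-average kernel instead of the general
+// filter.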
+static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t avg = vrhaddq_u16(s0, s1);
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
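+// General-case subpel variance: filter horizontally into tmp0 (h + 1 rows, so
+// the vertical pass has one extra row to read below the block), filter
+// vertically into tmp1, then compute the variance of tmp1 against the
+// reference block.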
+#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ \
+ return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+ w, ref, ref_stride, sse); \
+ }
+
+#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
+ h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \
+ src_stride, h, yoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
+ xoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// Combine bilinear filter with aom_highbd_comp_avg_pred for blocks having
+// width 4.
+static void highbd_avg_pred_var_filter_block2d_bil_w4(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+ const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+ uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+ uint16x4_t p = vld1_u16(second_pred);
+
+ uint16x4_t blend = vmul_u16(s0, f0);
+ blend = vmla_u16(blend, s1, f1);
+ blend = vrshr_n_u16(blend, 3);
+
+ vst1_u16(dst_ptr, vrhadd_u16(blend, p));
+
+ src_ptr += src_stride;
+ dst_ptr += 4;
+ second_pred += 4;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with aom_highbd_comp_avg_pred for large blocks.
+static void highbd_avg_pred_var_filter_block2d_bil_large(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint16_t *second_pred) {
+ const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+ const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t blend = vmulq_u16(s0, f0);
+ blend = vmlaq_u16(blend, s1, f1);
+ blend = vrshrq_n_u16(blend, 3);
+
+ vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w8(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 8, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w16(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 16, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w32(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 32, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w64(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 64, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w128(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 128, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with aom_highbd_comp_avg_pred.
+static void highbd_avg_pred_var_filter_block2d_avg(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, const uint16_t *second_pred) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t avg = vrhaddq_u16(s0, s1);
+
+ uint16x8_t p = vld1q_u16(second_pred);
+ avg = vrhaddq_u16(avg, p);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Implementation of aom_highbd_comp_avg_pred for blocks having width >= 16.
+static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+ int src_stride, int dst_width, int dst_height,
+ const uint16_t *second_pred) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t avg = vrhaddq_u16(s, p);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ \
+ return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+ w, ref, ref_stride, sse); \
+ }
+
+#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ if (xoffset == 0) { \
+ uint16_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ src_ptr, tmp, source_stride, source_stride, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else { \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ src_ptr, tmp, source_stride, source_stride, h, yoffset, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ src_ptr, tmp0, source_stride, 1, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
+ (h + 1)); \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
+ (h + 1)); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ src_ptr, tmp0, source_stride, 1, h, xoffset, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * (h + 1)]; \
+ uint16_t tmp2[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, w, \
+ h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp2), \
+ w, ref, ref_stride, sse); \
+ }
+
+#define HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ if (xoffset == 0) { \
+ uint16_t tmp0[w * h]; \
+ if (yoffset == 0) { \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp0), second_pred, \
+ w, h, src, src_stride, msk, msk_stride, \
+ invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, src_stride, \
+ w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, \
+ src_stride, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ if (yoffset == 0) { \
+ uint16_t tmp0[w * h]; \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
+ xoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * (h + 1)]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#if !CONFIG_REALTIME_ONLY
+#define HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h + 1, \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ }
+
+#define SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ pre, pre_stride, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp, pre_stride, pre_stride, w, \
+ h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse); \
+ } else { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp, pre_stride, \
+ pre_stride, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h, \
+ xoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, \
+ h + 1, xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, \
+ h + 1, xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+// 10-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+// 12-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+#endif // !CONFIG_REALTIME_ONLY
+
+static void highbd_dist_wtd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+ int src_stride, int dst_width,
+ int dst_height,
+ const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialise on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+ const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t avg = dist_wtd_avg_u16x8(s, p, fwd_offset, bck_offset);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ second_pred += 8;
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_avg(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialise on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+ const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t p = vld1q_u16(second_pred);
+ uint16x8_t avg = vrhaddq_u16(s0, s1);
+ avg = dist_wtd_avg_u16x8(avg, p, fwd_offset, bck_offset);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ second_pred += 8;
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w4(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint16x4_t fwd_offset = vdup_n_u16(jcp_param->fwd_offset);
+ const uint16x4_t bck_offset = vdup_n_u16(jcp_param->bck_offset);
+ const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+ const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+ uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+ uint16x4_t p = vld1_u16(second_pred);
+
+ uint16x4_t blend = vmul_u16(s0, f0);
+ blend = vmla_u16(blend, s1, f1);
+ blend = vrshr_n_u16(blend, 3);
+
+ uint16x4_t avg = dist_wtd_avg_u16x4(blend, p, fwd_offset, bck_offset);
+
+ vst1_u16(dst_ptr, avg);
+
+ src_ptr += src_stride;
+ dst_ptr += 4;
+ second_pred += 4;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for large blocks.
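+// For each output pixel this computes, in effect:
+//   blend = ROUND_POWER_OF_TWO(s0 * (8 - filter_offset) +
+//                              s1 * filter_offset, 3)
+//   dst = dist_wtd_avg(blend, second_pred)
+// where dist_wtd_avg() applies the fwd_offset/bck_offset compound weights
+// from jcp_param (see dist_wtd_avg_u16x8() for the exact rounding).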
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint16_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+ const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+ const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t blend = vmulq_u16(s0, f0);
+ blend = vmlaq_u16(blend, s1, f1);
+ blend = vrshrq_n_u16(blend, 3);
+
+ uint16x8_t avg = dist_wtd_avg_u16x8(blend, p, fwd_offset, bck_offset);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ second_pred += 8;
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
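+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 8.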
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w8(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 8, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 16.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w16(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 32.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w32(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 64.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w64(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 128.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w128(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
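+// Generic path: filter horizontally into a (h + 1)-row intermediate buffer,
+// then filter vertically fused with the distance-weighted average, and
+// finally take the variance of the result against the reference block.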
+#define HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *second = CONVERT_TO_SHORTPTR(second_pred); \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+ xoffset); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ }
+
+#define SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *second = CONVERT_TO_SHORTPTR(second_pred); \
+ if (xoffset == 0) { \
+ uint16_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ highbd_dist_wtd_avg_pred(src, tmp, source_stride, w, h, second, \
+ jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg( \
+ src, tmp, source_stride, source_stride, w, h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
+ } else { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp, source_stride, source_stride, h, yoffset, second, \
+ jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg( \
+ src, tmp0, source_stride, 1, w, h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, \
+ h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp0, source_stride, 1, h, xoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+ xoffset); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, \
+ h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+ xoffset); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_variance_neon.c b/third_party/aom/aom_dsp/arm/highbd_variance_neon.c
new file mode 100644
index 0000000000..18b8efff4c
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_variance_neon.c
@@ -0,0 +1,502 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/variance.h"
+
+// Process a block of width 4 two rows at a time.
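+// The running sum is kept in 16-bit lanes: the 4-wide blocks handled here are
+// at most 16 rows tall, so each lane accumulates no more than 8 differences
+// of magnitude <= 4095, i.e. 32760, which still fits in an int16_t.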
+static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+
+ int i = h;
+ do {
+ const uint16x8_t s = load_unaligned_u16_4x2(src_ptr, src_stride);
+ const uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride);
+
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+ sum_s16 = vaddq_s16(sum_s16, diff);
+
+ sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff));
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ *sum = horizontal_add_s16x8(sum_s16);
+ *sse = horizontal_add_s32x4(sse_s32);
+}
+
+// For 8-bit and 10-bit data, since we're using two int32x4 accumulators, all
+// block sizes can be processed in 32-bit elements (1023*1023*128*32 =
+// 4286582784 for a 128x128 block).
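+// That total is below UINT32_MAX (4294967295) but above INT32_MAX, which is
+// why the two accumulators are summed as unsigned and reduced with a widening
+// 64-bit horizontal add below.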
+static INLINE void highbd_variance_large_neon(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int w, int h,
+ uint64_t *sse, int64_t *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t s = vld1q_u16(src_ptr + j);
+ const uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+ sum_s32 = vpadalq_s16(sum_s32, diff);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+ j += 8;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_s32x4(sum_s32);
+ *sse = horizontal_long_add_u32x4(vaddq_u32(
+ vreinterpretq_u32_s32(sse_s32[0]), vreinterpretq_u32_s32(sse_s32[1])));
+}
+
+static INLINE void highbd_variance_8xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse,
+ int64_t *sum) {
+ highbd_variance_large_neon(src, src_stride, ref, ref_stride, 8, h, sse, sum);
+}
+
+static INLINE void highbd_variance_16xh_neon(const uint16_t *src,
+ int src_stride,
+ const uint16_t *ref,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_variance_large_neon(src, src_stride, ref, ref_stride, 16, h, sse, sum);
+}
+
+static INLINE void highbd_variance_32xh_neon(const uint16_t *src,
+ int src_stride,
+ const uint16_t *ref,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
+
+static INLINE void highbd_variance_64xh_neon(const uint16_t *src,
+ int src_stride,
+ const uint16_t *ref,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
+
+static INLINE void highbd_variance_128xh_neon(const uint16_t *src,
+ int src_stride,
+ const uint16_t *ref,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_variance_large_neon(src, src_stride, ref, ref_stride, 128, h, sse,
+ sum);
+}
+
+// For 12-bit data, we can only accumulate up to 128 elements in the sum of
+// squares (4095*4095*128 = 2146435200), and because we're using two int32x4
+// accumulators, we can only process up to 32 32-element rows (32*32/8 = 128)
+// or 16 64-element rows before we have to accumulate into 64-bit elements.
+// Therefore blocks of size 32x64, 64x32, 64x64, 64x128, 128x64, 128x128 are
+// processed in a different helper function.
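+// In other words, each 32-bit lane may hold at most 128 squared differences,
+// so the per-pass row limit is h_limit = 128 / (w / 8): 32 rows for w == 32,
+// 16 for w == 64 and 8 for w == 128, matching the h_limit arguments below.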
+
+// Process a block of any size where the width is divisible by 8, with
+// accumulation into 64-bit elements.
+static INLINE void highbd_variance_xlarge_neon(
+ const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr,
+ int ref_stride, int w, int h, int h_limit, uint64_t *sse, int64_t *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit
+ // accumulator overflows. After hitting this limit we accumulate into 64-bit
+ // elements.
+ int h_tmp = h > h_limit ? h_limit : h;
+
+ int i = 0;
+ do {
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ const uint16x8_t r0 = vld1q_u16(ref_ptr + j);
+
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+ sum_s32 = vpadalq_s16(sum_s32, diff);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+ j += 8;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ i++;
+ } while (i < h_tmp);
+
+ sse_s64 = vpadalq_s32(sse_s64, sse_s32[0]);
+ sse_s64 = vpadalq_s32(sse_s64, sse_s32[1]);
+ h_tmp += h_limit;
+ } while (i < h);
+
+ *sum = horizontal_add_s32x4(sum_s32);
+ *sse = (uint64_t)horizontal_add_s64x2(sse_s64);
+}
+
+static INLINE void highbd_variance_32xh_xlarge_neon(
+ const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse, int64_t *sum) {
+ highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 32, h, 32, sse,
+ sum);
+}
+
+static INLINE void highbd_variance_64xh_xlarge_neon(
+ const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse, int64_t *sum) {
+ highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 64, h, 16, sse,
+ sum);
+}
+
+static INLINE void highbd_variance_128xh_xlarge_neon(
+ const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse, int64_t *sum) {
+ highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 128, h, 8, sse,
+ sum);
+}
+
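+// The 10-bit and 12-bit wrappers below scale the accumulated sum and SSE back
+// down to the 8-bit range before computing the variance: the sum is rounded
+// by 2 and 4 bits and the SSE by 4 and 8 bits respectively.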
+#define HBD_VARIANCE_WXH_8_NEON(w, h) \
+ uint32_t aom_highbd_8_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)sse_long; \
+ sum = (int)sum_long; \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \
+ }
+
+#define HBD_VARIANCE_WXH_10_NEON(w, h) \
+ uint32_t aom_highbd_10_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HBD_VARIANCE_WXH_12_NEON(w, h) \
+ uint32_t aom_highbd_12_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HBD_VARIANCE_WXH_12_XLARGE_NEON(w, h) \
+ uint32_t aom_highbd_12_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_xlarge_neon(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+// 8-bit
+HBD_VARIANCE_WXH_8_NEON(4, 4)
+HBD_VARIANCE_WXH_8_NEON(4, 8)
+
+HBD_VARIANCE_WXH_8_NEON(8, 4)
+HBD_VARIANCE_WXH_8_NEON(8, 8)
+HBD_VARIANCE_WXH_8_NEON(8, 16)
+
+HBD_VARIANCE_WXH_8_NEON(16, 8)
+HBD_VARIANCE_WXH_8_NEON(16, 16)
+HBD_VARIANCE_WXH_8_NEON(16, 32)
+
+HBD_VARIANCE_WXH_8_NEON(32, 16)
+HBD_VARIANCE_WXH_8_NEON(32, 32)
+HBD_VARIANCE_WXH_8_NEON(32, 64)
+
+HBD_VARIANCE_WXH_8_NEON(64, 32)
+HBD_VARIANCE_WXH_8_NEON(64, 64)
+HBD_VARIANCE_WXH_8_NEON(64, 128)
+
+HBD_VARIANCE_WXH_8_NEON(128, 64)
+HBD_VARIANCE_WXH_8_NEON(128, 128)
+
+// 10-bit
+HBD_VARIANCE_WXH_10_NEON(4, 4)
+HBD_VARIANCE_WXH_10_NEON(4, 8)
+
+HBD_VARIANCE_WXH_10_NEON(8, 4)
+HBD_VARIANCE_WXH_10_NEON(8, 8)
+HBD_VARIANCE_WXH_10_NEON(8, 16)
+
+HBD_VARIANCE_WXH_10_NEON(16, 8)
+HBD_VARIANCE_WXH_10_NEON(16, 16)
+HBD_VARIANCE_WXH_10_NEON(16, 32)
+
+HBD_VARIANCE_WXH_10_NEON(32, 16)
+HBD_VARIANCE_WXH_10_NEON(32, 32)
+HBD_VARIANCE_WXH_10_NEON(32, 64)
+
+HBD_VARIANCE_WXH_10_NEON(64, 32)
+HBD_VARIANCE_WXH_10_NEON(64, 64)
+HBD_VARIANCE_WXH_10_NEON(64, 128)
+
+HBD_VARIANCE_WXH_10_NEON(128, 64)
+HBD_VARIANCE_WXH_10_NEON(128, 128)
+
+// 12-bit
+HBD_VARIANCE_WXH_12_NEON(4, 4)
+HBD_VARIANCE_WXH_12_NEON(4, 8)
+
+HBD_VARIANCE_WXH_12_NEON(8, 4)
+HBD_VARIANCE_WXH_12_NEON(8, 8)
+HBD_VARIANCE_WXH_12_NEON(8, 16)
+
+HBD_VARIANCE_WXH_12_NEON(16, 8)
+HBD_VARIANCE_WXH_12_NEON(16, 16)
+HBD_VARIANCE_WXH_12_NEON(16, 32)
+
+HBD_VARIANCE_WXH_12_NEON(32, 16)
+HBD_VARIANCE_WXH_12_NEON(32, 32)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(32, 64)
+
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 32)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 128)
+
+HBD_VARIANCE_WXH_12_XLARGE_NEON(128, 64)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+// 8-bit
+HBD_VARIANCE_WXH_8_NEON(4, 16)
+
+HBD_VARIANCE_WXH_8_NEON(8, 32)
+
+HBD_VARIANCE_WXH_8_NEON(16, 4)
+HBD_VARIANCE_WXH_8_NEON(16, 64)
+
+HBD_VARIANCE_WXH_8_NEON(32, 8)
+
+HBD_VARIANCE_WXH_8_NEON(64, 16)
+
+// 10-bit
+HBD_VARIANCE_WXH_10_NEON(4, 16)
+
+HBD_VARIANCE_WXH_10_NEON(8, 32)
+
+HBD_VARIANCE_WXH_10_NEON(16, 4)
+HBD_VARIANCE_WXH_10_NEON(16, 64)
+
+HBD_VARIANCE_WXH_10_NEON(32, 8)
+
+HBD_VARIANCE_WXH_10_NEON(64, 16)
+
+// 12-bit
+HBD_VARIANCE_WXH_12_NEON(4, 16)
+
+HBD_VARIANCE_WXH_12_NEON(8, 32)
+
+HBD_VARIANCE_WXH_12_NEON(16, 4)
+HBD_VARIANCE_WXH_12_NEON(16, 64)
+
+HBD_VARIANCE_WXH_12_NEON(32, 8)
+
+HBD_VARIANCE_WXH_12_NEON(64, 16)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int w, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+ uint16x8_t diff = vabdq_u16(s, r);
+
+ sse_u32[0] =
+ vmlal_u16(sse_u32[0], vget_low_u16(diff), vget_low_u16(diff));
+ sse_u32[1] =
+ vmlal_u16(sse_u32[1], vget_high_u16(diff), vget_high_u16(diff));
+
+ j += 8;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sse = horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+ return *sse;
+}
+
+#define HIGHBD_MSE_WXH_NEON(w, h) \
+ uint32_t aom_highbd_8_mse##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \
+ return *sse; \
+ } \
+ \
+ uint32_t aom_highbd_10_mse##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+ return *sse; \
+ } \
+ \
+ uint32_t aom_highbd_12_mse##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+ return *sse; \
+ }
+
+HIGHBD_MSE_WXH_NEON(16, 16)
+HIGHBD_MSE_WXH_NEON(16, 8)
+HIGHBD_MSE_WXH_NEON(8, 16)
+HIGHBD_MSE_WXH_NEON(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON
+
+static INLINE uint64x2_t mse_accumulate_u16_8x2(uint64x2_t sum, uint16x8_t s0,
+ uint16x8_t s1, uint16x8_t d0,
+ uint16x8_t d1) {
+ uint16x8_t e0 = vabdq_u16(s0, d0);
+ uint16x8_t e1 = vabdq_u16(s1, d1);
+
+ uint32x4_t mse = vmull_u16(vget_low_u16(e0), vget_low_u16(e0));
+ mse = vmlal_u16(mse, vget_high_u16(e0), vget_high_u16(e0));
+ mse = vmlal_u16(mse, vget_low_u16(e1), vget_low_u16(e1));
+ mse = vmlal_u16(mse, vget_high_u16(e1), vget_high_u16(e1));
+
+ return vpadalq_u32(sum, mse);
+}
+
+uint64_t aom_mse_wxh_16bit_highbd_neon(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int w,
+ int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4));
+
+ uint64x2_t sum = vdupq_n_u64(0);
+
+ if (w == 8) {
+ do {
+ uint16x8_t d0 = vld1q_u16(dst + 0 * dstride);
+ uint16x8_t d1 = vld1q_u16(dst + 1 * dstride);
+ uint16x8_t s0 = vld1q_u16(src + 0 * sstride);
+ uint16x8_t s1 = vld1q_u16(src + 1 * sstride);
+
+ sum = mse_accumulate_u16_8x2(sum, s0, s1, d0, d1);
+
+ dst += 2 * dstride;
+ src += 2 * sstride;
+ h -= 2;
+ } while (h != 0);
+ } else { // w == 4
+ do {
+ uint16x8_t d0 = load_unaligned_u16_4x2(dst + 0 * dstride, dstride);
+ uint16x8_t d1 = load_unaligned_u16_4x2(dst + 2 * dstride, dstride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride);
+
+ sum = mse_accumulate_u16_8x2(sum, s0, s1, d0, d1);
+
+ dst += 4 * dstride;
+ src += 4 * sstride;
+ h -= 4;
+ } while (h != 0);
+ }
+
+ return horizontal_add_u64x2(sum);
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_variance_neon_dotprod.c b/third_party/aom/aom_dsp/arm/highbd_variance_neon_dotprod.c
new file mode 100644
index 0000000000..d56ae97571
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_variance_neon_dotprod.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
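+// These helpers are only reached through the aom_highbd_8_mse* wrappers, so
+// every sample fits in 8 bits and the vmovn_u16() narrowing below is
+// lossless; this allows the absolute differences to be accumulated with the
+// 8-bit vdotq_u32() dot-product instruction.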
+static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h / 2;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ src_ptr += src_stride;
+ uint16x8_t s1 = vld1q_u16(src_ptr);
+ src_ptr += src_stride;
+ uint16x8_t r0 = vld1q_u16(ref_ptr);
+ ref_ptr += ref_stride;
+ uint16x8_t r1 = vld1q_u16(ref_ptr);
+ ref_ptr += ref_stride;
+
+ uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+ uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+ uint8x16_t diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, diff, diff);
+ } while (--i != 0);
+
+ *sse = horizontal_add_u32x4(sse_u32);
+ return *sse;
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+ uint16x8_t r0 = vld1q_u16(ref_ptr);
+ uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+
+ uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+ uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+ uint8x16_t diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, diff, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sse = horizontal_add_u32x4(sse_u32);
+ return *sse;
+}
+
+#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h) \
+ uint32_t aom_highbd_8_mse##w##x##h##_neon_dotprod( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, \
+ sse); \
+ return *sse; \
+ }
+
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON_DOTPROD
diff --git a/third_party/aom/aom_dsp/arm/highbd_variance_sve.c b/third_party/aom/aom_dsp/arm/highbd_variance_sve.c
new file mode 100644
index 0000000000..d0058bfa90
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_variance_sve.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/variance.h"
+
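+// The SVE implementations accumulate squared differences with the 16-bit dot
+// product helper aom_sdotq_s16() (see dot_sve.h), which widens directly into
+// 64-bit lanes, so no bit-depth-specific overflow handling is required here.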
+// Process a block of width 4 two rows at a time.
+static INLINE void highbd_variance_4xh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h, uint64_t *sse,
+ int64_t *sum) {
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ do {
+ const uint16x8_t s = load_unaligned_u16_4x2(src_ptr, src_stride);
+ const uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride);
+
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+ sum_s16 = vaddq_s16(sum_s16, diff);
+
+ sse_s64 = aom_sdotq_s16(sse_s64, diff, diff);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ h -= 2;
+ } while (h != 0);
+
+ *sum = vaddlvq_s16(sum_s16);
+ *sse = vaddvq_s64(sse_s64);
+}
+
+static INLINE void variance_8x1_sve(const uint16_t *src, const uint16_t *ref,
+ int32x4_t *sum, int64x2_t *sse) {
+ const uint16x8_t s = vld1q_u16(src);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+ *sum = vpadalq_s16(*sum, diff);
+
+ *sse = aom_sdotq_s16(*sse, diff, diff);
+}
+
+static INLINE void highbd_variance_8xh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h, uint64_t *sse,
+ int64_t *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ do {
+ variance_8x1_sve(src_ptr, ref_ptr, &sum_s32, &sse_s64);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ *sum = vaddlvq_s32(sum_s32);
+ *sse = vaddvq_s64(sse_s64);
+}
+
+static INLINE void highbd_variance_16xh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ variance_8x1_sve(src_ptr, ref_ptr, &sum_s32[0], &sse_s64[0]);
+ variance_8x1_sve(src_ptr + 8, ref_ptr + 8, &sum_s32[1], &sse_s64[1]);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ *sum = vaddlvq_s32(vaddq_s32(sum_s32[0], sum_s32[1]));
+ *sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[1]));
+}
+
+static INLINE void highbd_variance_large_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int w, int h,
+ uint64_t *sse, int64_t *sum) {
+ int32x4_t sum_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+ int64x2_t sse_s64[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+ vdupq_n_s64(0) };
+
+ do {
+ int j = 0;
+ do {
+ variance_8x1_sve(src_ptr + j, ref_ptr + j, &sum_s32[0], &sse_s64[0]);
+ variance_8x1_sve(src_ptr + j + 8, ref_ptr + j + 8, &sum_s32[1],
+ &sse_s64[1]);
+ variance_8x1_sve(src_ptr + j + 16, ref_ptr + j + 16, &sum_s32[2],
+ &sse_s64[2]);
+ variance_8x1_sve(src_ptr + j + 24, ref_ptr + j + 24, &sum_s32[3],
+ &sse_s64[3]);
+
+ j += 32;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]);
+ sum_s32[2] = vaddq_s32(sum_s32[2], sum_s32[3]);
+ *sum = vaddlvq_s32(vaddq_s32(sum_s32[0], sum_s32[2]));
+ sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]);
+ sse_s64[2] = vaddq_s64(sse_s64[2], sse_s64[3]);
+ *sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[2]));
+}
+
+static INLINE void highbd_variance_32xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse,
+ int64_t *sum) {
+ highbd_variance_large_sve(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
+
+static INLINE void highbd_variance_64xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse,
+ int64_t *sum) {
+ highbd_variance_large_sve(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
+
+static INLINE void highbd_variance_128xh_sve(const uint16_t *src,
+ int src_stride,
+ const uint16_t *ref,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_variance_large_sve(src, src_stride, ref, ref_stride, 128, h, sse, sum);
+}
+
+#define HBD_VARIANCE_WXH_8_SVE(w, h) \
+ uint32_t aom_highbd_8_variance##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)sse_long; \
+ sum = (int)sum_long; \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \
+ }
+
+#define HBD_VARIANCE_WXH_10_SVE(w, h) \
+ uint32_t aom_highbd_10_variance##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HBD_VARIANCE_WXH_12_SVE(w, h) \
+ uint32_t aom_highbd_12_variance##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+// 8-bit
+HBD_VARIANCE_WXH_8_SVE(4, 4)
+HBD_VARIANCE_WXH_8_SVE(4, 8)
+
+HBD_VARIANCE_WXH_8_SVE(8, 4)
+HBD_VARIANCE_WXH_8_SVE(8, 8)
+HBD_VARIANCE_WXH_8_SVE(8, 16)
+
+HBD_VARIANCE_WXH_8_SVE(16, 8)
+HBD_VARIANCE_WXH_8_SVE(16, 16)
+HBD_VARIANCE_WXH_8_SVE(16, 32)
+
+HBD_VARIANCE_WXH_8_SVE(32, 16)
+HBD_VARIANCE_WXH_8_SVE(32, 32)
+HBD_VARIANCE_WXH_8_SVE(32, 64)
+
+HBD_VARIANCE_WXH_8_SVE(64, 32)
+HBD_VARIANCE_WXH_8_SVE(64, 64)
+HBD_VARIANCE_WXH_8_SVE(64, 128)
+
+HBD_VARIANCE_WXH_8_SVE(128, 64)
+HBD_VARIANCE_WXH_8_SVE(128, 128)
+
+// 10-bit
+HBD_VARIANCE_WXH_10_SVE(4, 4)
+HBD_VARIANCE_WXH_10_SVE(4, 8)
+
+HBD_VARIANCE_WXH_10_SVE(8, 4)
+HBD_VARIANCE_WXH_10_SVE(8, 8)
+HBD_VARIANCE_WXH_10_SVE(8, 16)
+
+HBD_VARIANCE_WXH_10_SVE(16, 8)
+HBD_VARIANCE_WXH_10_SVE(16, 16)
+HBD_VARIANCE_WXH_10_SVE(16, 32)
+
+HBD_VARIANCE_WXH_10_SVE(32, 16)
+HBD_VARIANCE_WXH_10_SVE(32, 32)
+HBD_VARIANCE_WXH_10_SVE(32, 64)
+
+HBD_VARIANCE_WXH_10_SVE(64, 32)
+HBD_VARIANCE_WXH_10_SVE(64, 64)
+HBD_VARIANCE_WXH_10_SVE(64, 128)
+
+HBD_VARIANCE_WXH_10_SVE(128, 64)
+HBD_VARIANCE_WXH_10_SVE(128, 128)
+
+// 12-bit
+HBD_VARIANCE_WXH_12_SVE(4, 4)
+HBD_VARIANCE_WXH_12_SVE(4, 8)
+
+HBD_VARIANCE_WXH_12_SVE(8, 4)
+HBD_VARIANCE_WXH_12_SVE(8, 8)
+HBD_VARIANCE_WXH_12_SVE(8, 16)
+
+HBD_VARIANCE_WXH_12_SVE(16, 8)
+HBD_VARIANCE_WXH_12_SVE(16, 16)
+HBD_VARIANCE_WXH_12_SVE(16, 32)
+
+HBD_VARIANCE_WXH_12_SVE(32, 16)
+HBD_VARIANCE_WXH_12_SVE(32, 32)
+HBD_VARIANCE_WXH_12_SVE(32, 64)
+
+HBD_VARIANCE_WXH_12_SVE(64, 32)
+HBD_VARIANCE_WXH_12_SVE(64, 64)
+HBD_VARIANCE_WXH_12_SVE(64, 128)
+
+HBD_VARIANCE_WXH_12_SVE(128, 64)
+HBD_VARIANCE_WXH_12_SVE(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+// 8-bit
+HBD_VARIANCE_WXH_8_SVE(4, 16)
+
+HBD_VARIANCE_WXH_8_SVE(8, 32)
+
+HBD_VARIANCE_WXH_8_SVE(16, 4)
+HBD_VARIANCE_WXH_8_SVE(16, 64)
+
+HBD_VARIANCE_WXH_8_SVE(32, 8)
+
+HBD_VARIANCE_WXH_8_SVE(64, 16)
+
+// 10-bit
+HBD_VARIANCE_WXH_10_SVE(4, 16)
+
+HBD_VARIANCE_WXH_10_SVE(8, 32)
+
+HBD_VARIANCE_WXH_10_SVE(16, 4)
+HBD_VARIANCE_WXH_10_SVE(16, 64)
+
+HBD_VARIANCE_WXH_10_SVE(32, 8)
+
+HBD_VARIANCE_WXH_10_SVE(64, 16)
+
+// 12-bit
+HBD_VARIANCE_WXH_12_SVE(4, 16)
+
+HBD_VARIANCE_WXH_12_SVE(8, 32)
+
+HBD_VARIANCE_WXH_12_SVE(16, 4)
+HBD_VARIANCE_WXH_12_SVE(16, 64)
+
+HBD_VARIANCE_WXH_12_SVE(32, 8)
+
+HBD_VARIANCE_WXH_12_SVE(64, 16)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef HBD_VARIANCE_WXH_8_SVE
+#undef HBD_VARIANCE_WXH_10_SVE
+#undef HBD_VARIANCE_WXH_12_SVE
+
+static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int w, int h,
+ unsigned int *sse) {
+ uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+ uint16x8_t diff = vabdq_u16(s, r);
+
+ sse_u64 = aom_udotq_u16(sse_u64, diff, diff);
+
+ j += 8;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ *sse = (uint32_t)vaddvq_u64(sse_u64);
+ return *sse;
+}
+
+#define HIGHBD_MSE_WXH_SVE(w, h) \
+ uint32_t aom_highbd_8_mse##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse); \
+ return *sse; \
+ } \
+ \
+ uint32_t aom_highbd_10_mse##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+ return *sse; \
+ } \
+ \
+ uint32_t aom_highbd_12_mse##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+ return *sse; \
+ }
+
+HIGHBD_MSE_WXH_SVE(16, 16)
+HIGHBD_MSE_WXH_SVE(16, 8)
+HIGHBD_MSE_WXH_SVE(8, 16)
+HIGHBD_MSE_WXH_SVE(8, 8)
+
+#undef HIGHBD_MSE_WXH_SVE
+
+uint64_t aom_mse_wxh_16bit_highbd_sve(uint16_t *dst, int dstride, uint16_t *src,
+ int sstride, int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4));
+
+ uint64x2_t sum = vdupq_n_u64(0);
+
+ if (w == 8) {
+ do {
+ uint16x8_t d0 = vld1q_u16(dst + 0 * dstride);
+ uint16x8_t d1 = vld1q_u16(dst + 1 * dstride);
+ uint16x8_t s0 = vld1q_u16(src + 0 * sstride);
+ uint16x8_t s1 = vld1q_u16(src + 1 * sstride);
+
+ uint16x8_t abs_diff0 = vabdq_u16(s0, d0);
+ uint16x8_t abs_diff1 = vabdq_u16(s1, d1);
+
+ sum = aom_udotq_u16(sum, abs_diff0, abs_diff0);
+ sum = aom_udotq_u16(sum, abs_diff1, abs_diff1);
+
+ dst += 2 * dstride;
+ src += 2 * sstride;
+ h -= 2;
+ } while (h != 0);
+ } else { // w == 4
+ do {
+ uint16x8_t d0 = load_unaligned_u16_4x2(dst + 0 * dstride, dstride);
+ uint16x8_t d1 = load_unaligned_u16_4x2(dst + 2 * dstride, dstride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride);
+
+ uint16x8_t abs_diff0 = vabdq_u16(s0, d0);
+ uint16x8_t abs_diff1 = vabdq_u16(s1, d1);
+
+ sum = aom_udotq_u16(sum, abs_diff0, abs_diff0);
+ sum = aom_udotq_u16(sum, abs_diff1, abs_diff1);
+
+ dst += 4 * dstride;
+ src += 4 * sstride;
+ h -= 4;
+ } while (h != 0);
+ }
+
+ return vaddvq_u64(sum);
+}
diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c
new file mode 100644
index 0000000000..d8dc60c1fe
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c
@@ -0,0 +1,3110 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/reinterpret_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/intrapred_common.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+static INLINE uint16x8_t dc_load_sum_4(const uint8_t *in) {
+ const uint8x8_t a = load_u8_4x1(in);
+ const uint16x4_t p0 = vpaddl_u8(a);
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ return vcombine_u16(p1, vdup_n_u16(0));
+}
+
+static INLINE void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x8_t dc) {
+ for (int i = 0; i < h; ++i) {
+ store_u8_4x1(dst + i * stride, dc);
+ }
+}
+
+void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_sum_4(above);
+ const uint16x8_t sum_left = dc_load_sum_4(left);
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);
+ dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
+}
+
+void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16x8_t sum_left = dc_load_sum_4(left);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 2);
+ (void)above;
+ dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
+}
+
+void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_sum_4(above);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 2);
+ (void)left;
+ dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
+}
+
+void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t dc0 = vdup_n_u8(0x80);
+ (void)above;
+ (void)left;
+ dc_store_4xh(dst, stride, 4, dc0);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+static INLINE uint16x8_t dc_load_sum_8(const uint8_t *in) {
+ // This isn't used in the case where we want to load both above and left
+ // vectors, since we want to avoid performing the reduction twice.
+ const uint8x8_t a = vld1_u8(in);
+ const uint16x4_t p0 = vpaddl_u8(a);
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ return vcombine_u16(p2, vdup_n_u16(0));
+}
+
+static INLINE uint16x8_t horizontal_add_and_broadcast_u16x8(uint16x8_t a) {
+#if AOM_ARCH_AARCH64
+ // On AArch64 we could also use vdupq_n_u16(vaddvq_u16(a)) here to save an
+ // instruction, however the addv instruction is usually slightly more
+ // expensive than a pairwise addition, so the need for immediately
+ // broadcasting the result again seems to negate any benefit.
+ const uint16x8_t b = vpaddq_u16(a, a);
+ const uint16x8_t c = vpaddq_u16(b, b);
+ return vpaddq_u16(c, c);
+#else
+ const uint16x4_t b = vadd_u16(vget_low_u16(a), vget_high_u16(a));
+ const uint16x4_t c = vpadd_u16(b, b);
+ const uint16x4_t d = vpadd_u16(c, c);
+ return vcombine_u16(d, d);
+#endif
+}
+
+static INLINE void dc_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x8_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1_u8(dst + i * stride, dc);
+ }
+}
+
+void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t sum_top = vld1_u8(above);
+ const uint8x8_t sum_left = vld1_u8(left);
+ uint16x8_t sum = vaddl_u8(sum_left, sum_top);
+ sum = horizontal_add_and_broadcast_u16x8(sum);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, 4);
+ dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
+}
+
+void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16x8_t sum_left = dc_load_sum_8(left);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 3);
+ (void)above;
+ dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
+}
+
+void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_sum_8(above);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 3);
+ (void)left;
+ dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
+}
+
+void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t dc0 = vdup_n_u8(0x80);
+ (void)above;
+ (void)left;
+ dc_store_8xh(dst, stride, 8, dc0);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+static INLINE uint16x8_t dc_load_partial_sum_16(const uint8_t *in) {
+ const uint8x16_t a = vld1q_u8(in);
+ // delay the remainder of the reduction until
+ // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
+ // than twice in the case we are loading both above and left.
+ return vpaddlq_u8(a);
+}
+
+static INLINE uint16x8_t dc_load_sum_16(const uint8_t *in) {
+ return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_16(in));
+}
+
+static INLINE void dc_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u8(dst + i * stride, dc);
+ }
+}
+
+void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_partial_sum_16(above);
+ const uint16x8_t sum_left = dc_load_partial_sum_16(left);
+ uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ sum = horizontal_add_and_broadcast_u16x8(sum);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, 5);
+ dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint16x8_t sum_left = dc_load_sum_16(left);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 4);
+ (void)above;
+ dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_sum_16(above);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 4);
+ (void)left;
+ dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint8x16_t dc0 = vdupq_n_u8(0x80);
+ (void)above;
+ (void)left;
+ dc_store_16xh(dst, stride, 16, dc0);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+static INLINE uint16x8_t dc_load_partial_sum_32(const uint8_t *in) {
+ const uint8x16_t a0 = vld1q_u8(in);
+ const uint8x16_t a1 = vld1q_u8(in + 16);
+ // delay the remainder of the reduction until
+ // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
+ // than twice in the case we are loading both above and left.
+ return vpadalq_u8(vpaddlq_u8(a0), a1);
+}
+
+static INLINE uint16x8_t dc_load_sum_32(const uint8_t *in) {
+ return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_32(in));
+}
+
+static INLINE void dc_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u8(dst + i * stride, dc);
+ vst1q_u8(dst + i * stride + 16, dc);
+ }
+}
+
+void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_partial_sum_32(above);
+ const uint16x8_t sum_left = dc_load_partial_sum_32(left);
+ uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ sum = horizontal_add_and_broadcast_u16x8(sum);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, 6);
+ dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint16x8_t sum_left = dc_load_sum_32(left);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 5);
+ (void)above;
+ dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_sum_32(above);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 5);
+ (void)left;
+ dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint8x16_t dc0 = vdupq_n_u8(0x80);
+ (void)above;
+ (void)left;
+ dc_store_32xh(dst, stride, 32, dc0);
+}
+
+//------------------------------------------------------------------------------
+// DC 64x64
+
+static INLINE uint16x8_t dc_load_partial_sum_64(const uint8_t *in) {
+ const uint8x16_t a0 = vld1q_u8(in);
+ const uint8x16_t a1 = vld1q_u8(in + 16);
+ const uint8x16_t a2 = vld1q_u8(in + 32);
+ const uint8x16_t a3 = vld1q_u8(in + 48);
+ const uint16x8_t p01 = vpadalq_u8(vpaddlq_u8(a0), a1);
+ const uint16x8_t p23 = vpadalq_u8(vpaddlq_u8(a2), a3);
+ // delay the remainder of the reduction until
+ // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
+ // than twice in the case we are loading both above and left.
+ return vaddq_u16(p01, p23);
+}
+
+static INLINE uint16x8_t dc_load_sum_64(const uint8_t *in) {
+ return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_64(in));
+}
+
+static INLINE void dc_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u8(dst + i * stride, dc);
+ vst1q_u8(dst + i * stride + 16, dc);
+ vst1q_u8(dst + i * stride + 32, dc);
+ vst1q_u8(dst + i * stride + 48, dc);
+ }
+}
+
+void aom_dc_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_partial_sum_64(above);
+ const uint16x8_t sum_left = dc_load_partial_sum_64(left);
+ uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ sum = horizontal_add_and_broadcast_u16x8(sum);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, 7);
+ dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_left_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint16x8_t sum_left = dc_load_sum_64(left);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 6);
+ (void)above;
+ dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_top_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_sum_64(above);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 6);
+ (void)left;
+ dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_128_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint8x16_t dc0 = vdupq_n_u8(0x80);
+ (void)above;
+ (void)left;
+ dc_store_64xh(dst, stride, 64, dc0);
+}
+
+//------------------------------------------------------------------------------
+// DC rectangular cases
+
+#define DC_MULTIPLIER_1X2 0x5556
+#define DC_MULTIPLIER_1X4 0x3334
+
+#define DC_SHIFT2 16
+
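+// Fixed-point reciprocals: 0x5556 / 2^16 ~= 1/3 and 0x3334 / 2^16 ~= 1/5, so
+// multiplying by DC_MULTIPLIER_1X2 or DC_MULTIPLIER_1X4 and shifting right by
+// DC_SHIFT2 approximates division by 3 or by 5.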
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+ int multiplier, int shift2) {
+ const int interm = num >> shift1;
+ return interm * multiplier >> shift2;
+}
+
+static INLINE int calculate_dc_from_sum(int bw, int bh, uint32_t sum,
+ int shift1, int multiplier) {
+ const int expected_dc = divide_using_multiply_shift(
+ sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
+ assert(expected_dc < (1 << 8));
+ return expected_dc;
+}
+
+#undef DC_SHIFT2
+
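+// For example, aom_dc_predictor_4x8_neon sums the 4 + 8 = 12 border samples
+// and calls calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2): the shift
+// by 2 divides by 4 and the multiplier divides by roughly 3, giving the
+// rounded average sum / 12.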
+void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a = load_u8_4x1(above);
+ uint8x8_t l = vld1_u8(left);
+ uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
+ uint32_t dc = calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2);
+ dc_store_4xh(dst, stride, 8, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a = vld1_u8(above);
+ uint8x8_t l = load_u8_4x1(left);
+ uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
+ uint32_t dc = calculate_dc_from_sum(8, 4, sum, 2, DC_MULTIPLIER_1X2);
+ dc_store_8xh(dst, stride, 4, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a = load_u8_4x1(above);
+ uint8x16_t l = vld1q_u8(left);
+ uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(4, 16, sum, 2, DC_MULTIPLIER_1X4);
+ dc_store_4xh(dst, stride, 16, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t a = vld1q_u8(above);
+ uint8x8_t l = load_u8_4x1(left);
+ uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(16, 4, sum, 2, DC_MULTIPLIER_1X4);
+ dc_store_16xh(dst, stride, 4, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a = vld1_u8(above);
+ uint8x16_t l = vld1q_u8(left);
+ uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(8, 16, sum, 3, DC_MULTIPLIER_1X2);
+ dc_store_8xh(dst, stride, 16, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t a = vld1q_u8(above);
+ uint8x8_t l = vld1_u8(left);
+ uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(16, 8, sum, 3, DC_MULTIPLIER_1X2);
+ dc_store_16xh(dst, stride, 8, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a = vld1_u8(above);
+ uint16x8_t sum_left = dc_load_partial_sum_32(left);
+ uint16x8_t sum_al = vaddw_u8(sum_left, a);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(8, 32, sum, 3, DC_MULTIPLIER_1X4);
+ dc_store_8xh(dst, stride, 32, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_top = dc_load_partial_sum_32(above);
+ uint8x8_t l = vld1_u8(left);
+ uint16x8_t sum_al = vaddw_u8(sum_top, l);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(32, 8, sum, 3, DC_MULTIPLIER_1X4);
+ dc_store_32xh(dst, stride, 8, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_16(above);
+ uint16x8_t sum_left = dc_load_partial_sum_32(left);
+ uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(16, 32, sum, 4, DC_MULTIPLIER_1X2);
+ dc_store_16xh(dst, stride, 32, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_32(above);
+ uint16x8_t sum_left = dc_load_partial_sum_16(left);
+ uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(32, 16, sum, 4, DC_MULTIPLIER_1X2);
+ dc_store_32xh(dst, stride, 16, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_16(above);
+ uint16x8_t sum_left = dc_load_partial_sum_64(left);
+ uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(16, 64, sum, 4, DC_MULTIPLIER_1X4);
+ dc_store_16xh(dst, stride, 64, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_64(above);
+ uint16x8_t sum_left = dc_load_partial_sum_16(left);
+ uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(64, 16, sum, 4, DC_MULTIPLIER_1X4);
+ dc_store_64xh(dst, stride, 16, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_32(above);
+ uint16x8_t sum_left = dc_load_partial_sum_64(left);
+ uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(32, 64, sum, 5, DC_MULTIPLIER_1X2);
+ dc_store_32xh(dst, stride, 64, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_64(above);
+ uint16x8_t sum_left = dc_load_partial_sum_32(left);
+ uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(64, 32, sum, 5, DC_MULTIPLIER_1X2);
+ dc_store_64xh(dst, stride, 32, vdupq_n_u8(dc));
+}
+
+#undef DC_MULTIPLIER_1X2
+#undef DC_MULTIPLIER_1X4
+
+#define DC_PREDICTOR_128(w, h, q) \
+ void aom_dc_128_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ (void)above; \
+ (void)left; \
+ dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u8(0x80)); \
+ }
+
+DC_PREDICTOR_128(4, 8, )
+DC_PREDICTOR_128(4, 16, )
+DC_PREDICTOR_128(8, 4, )
+DC_PREDICTOR_128(8, 16, )
+DC_PREDICTOR_128(8, 32, )
+DC_PREDICTOR_128(16, 4, q)
+DC_PREDICTOR_128(16, 8, q)
+DC_PREDICTOR_128(16, 32, q)
+DC_PREDICTOR_128(16, 64, q)
+DC_PREDICTOR_128(32, 8, q)
+DC_PREDICTOR_128(32, 16, q)
+DC_PREDICTOR_128(32, 64, q)
+DC_PREDICTOR_128(64, 32, q)
+DC_PREDICTOR_128(64, 16, q)
+
+#undef DC_PREDICTOR_128
+
+#define DC_PREDICTOR_LEFT(w, h, shift, q) \
+ void aom_dc_left_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ (void)above; \
+ const uint16x8_t sum = dc_load_sum_##h(left); \
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift)); \
+ dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0)); \
+ }
+
+DC_PREDICTOR_LEFT(4, 8, 3, )
+DC_PREDICTOR_LEFT(8, 4, 2, )
+DC_PREDICTOR_LEFT(8, 16, 4, )
+DC_PREDICTOR_LEFT(16, 8, 3, q)
+DC_PREDICTOR_LEFT(16, 32, 5, q)
+DC_PREDICTOR_LEFT(32, 16, 4, q)
+DC_PREDICTOR_LEFT(32, 64, 6, q)
+DC_PREDICTOR_LEFT(64, 32, 5, q)
+DC_PREDICTOR_LEFT(4, 16, 4, )
+DC_PREDICTOR_LEFT(16, 4, 2, q)
+DC_PREDICTOR_LEFT(8, 32, 5, )
+DC_PREDICTOR_LEFT(32, 8, 3, q)
+DC_PREDICTOR_LEFT(16, 64, 6, q)
+DC_PREDICTOR_LEFT(64, 16, 4, q)
+
+#undef DC_PREDICTOR_LEFT
+
+#define DC_PREDICTOR_TOP(w, h, shift, q) \
+ void aom_dc_top_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ (void)left; \
+ const uint16x8_t sum = dc_load_sum_##w(above); \
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift)); \
+ dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0)); \
+ }
+
+DC_PREDICTOR_TOP(4, 8, 2, )
+DC_PREDICTOR_TOP(4, 16, 2, )
+DC_PREDICTOR_TOP(8, 4, 3, )
+DC_PREDICTOR_TOP(8, 16, 3, )
+DC_PREDICTOR_TOP(8, 32, 3, )
+DC_PREDICTOR_TOP(16, 4, 4, q)
+DC_PREDICTOR_TOP(16, 8, 4, q)
+DC_PREDICTOR_TOP(16, 32, 4, q)
+DC_PREDICTOR_TOP(16, 64, 4, q)
+DC_PREDICTOR_TOP(32, 8, 5, q)
+DC_PREDICTOR_TOP(32, 16, 5, q)
+DC_PREDICTOR_TOP(32, 64, 5, q)
+DC_PREDICTOR_TOP(64, 16, 6, q)
+DC_PREDICTOR_TOP(64, 32, 6, q)
+
+#undef DC_PREDICTOR_TOP
+
+// -----------------------------------------------------------------------------
+
+static INLINE void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x8_t d0) {
+ for (int i = 0; i < h; ++i) {
+ store_u8_4x1(dst + i * stride, d0);
+ }
+}
+
+static INLINE void v_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x8_t d0) {
+ for (int i = 0; i < h; ++i) {
+ vst1_u8(dst + i * stride, d0);
+ }
+}
+
+static INLINE void v_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t d0) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u8(dst + i * stride, d0);
+ }
+}
+
+static INLINE void v_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t d0, uint8x16_t d1) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u8(dst + 0, d0);
+ vst1q_u8(dst + 16, d1);
+ dst += stride;
+ }
+}
+
+static INLINE void v_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t d0, uint8x16_t d1, uint8x16_t d2,
+ uint8x16_t d3) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u8(dst + 0, d0);
+ vst1q_u8(dst + 16, d1);
+ vst1q_u8(dst + 32, d2);
+ vst1q_u8(dst + 48, d3);
+ dst += stride;
+ }
+}
+
+void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_4xh(dst, stride, 4, load_u8_4x1(above));
+}
+
+void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_8xh(dst, stride, 8, vld1_u8(above));
+}
+
+void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_16xh(dst, stride, 16, vld1q_u8(above));
+}
+
+void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ (void)left;
+ v_store_32xh(dst, stride, 32, d0, d1);
+}
+
+void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_4xh(dst, stride, 8, load_u8_4x1(above));
+}
+
+void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_4xh(dst, stride, 16, load_u8_4x1(above));
+}
+
+void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_8xh(dst, stride, 4, vld1_u8(above));
+}
+
+void aom_v_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_8xh(dst, stride, 16, vld1_u8(above));
+}
+
+void aom_v_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_8xh(dst, stride, 32, vld1_u8(above));
+}
+
+void aom_v_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_16xh(dst, stride, 4, vld1q_u8(above));
+}
+
+void aom_v_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_16xh(dst, stride, 8, vld1q_u8(above));
+}
+
+void aom_v_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_16xh(dst, stride, 32, vld1q_u8(above));
+}
+
+void aom_v_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_16xh(dst, stride, 64, vld1q_u8(above));
+}
+
+void aom_v_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ (void)left;
+ v_store_32xh(dst, stride, 8, d0, d1);
+}
+
+void aom_v_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ (void)left;
+ v_store_32xh(dst, stride, 16, d0, d1);
+}
+
+void aom_v_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ (void)left;
+ v_store_32xh(dst, stride, 64, d0, d1);
+}
+
+void aom_v_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ const uint8x16_t d2 = vld1q_u8(above + 32);
+ const uint8x16_t d3 = vld1q_u8(above + 48);
+ (void)left;
+ v_store_64xh(dst, stride, 16, d0, d1, d2, d3);
+}
+
+void aom_v_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ const uint8x16_t d2 = vld1q_u8(above + 32);
+ const uint8x16_t d3 = vld1q_u8(above + 48);
+ (void)left;
+ v_store_64xh(dst, stride, 32, d0, d1, d2, d3);
+}
+
+void aom_v_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ const uint8x16_t d2 = vld1q_u8(above + 32);
+ const uint8x16_t d3 = vld1q_u8(above + 48);
+ (void)left;
+ v_store_64xh(dst, stride, 64, d0, d1, d2, d3);
+}
+
+// -----------------------------------------------------------------------------
+
+static INLINE void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+ store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
+ store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
+ store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
+ store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3));
+ store_u8_4x1(dst + 4 * stride, vdup_lane_u8(d0, 4));
+ store_u8_4x1(dst + 5 * stride, vdup_lane_u8(d0, 5));
+ store_u8_4x1(dst + 6 * stride, vdup_lane_u8(d0, 6));
+ store_u8_4x1(dst + 7 * stride, vdup_lane_u8(d0, 7));
+}
+
+static INLINE void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+ vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
+ vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
+ vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
+ vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
+ vst1_u8(dst + 4 * stride, vdup_lane_u8(d0, 4));
+ vst1_u8(dst + 5 * stride, vdup_lane_u8(d0, 5));
+ vst1_u8(dst + 6 * stride, vdup_lane_u8(d0, 6));
+ vst1_u8(dst + 7 * stride, vdup_lane_u8(d0, 7));
+}
+
+static INLINE void h_store_16x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+ vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
+ vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
+ vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
+ vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
+ vst1q_u8(dst + 4 * stride, vdupq_lane_u8(d0, 4));
+ vst1q_u8(dst + 5 * stride, vdupq_lane_u8(d0, 5));
+ vst1q_u8(dst + 6 * stride, vdupq_lane_u8(d0, 6));
+ vst1q_u8(dst + 7 * stride, vdupq_lane_u8(d0, 7));
+}
+
+static INLINE void h_store_32x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
+}
+
+static INLINE void h_store_64x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 0));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 0));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 1));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 1));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 2));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 2));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 3));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 3));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 4));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 4));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 5));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 5));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 6));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 6));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 7));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 7));
+}
+
+void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d0 = load_u8_4x1(left);
+ (void)above;
+ store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
+ store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
+ store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
+ store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3));
+}
+
+void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d0 = vld1_u8(left);
+ (void)above;
+ h_store_8x8(dst, stride, d0);
+}
+
+void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ (void)above;
+ h_store_16x8(dst, stride, vget_low_u8(d0));
+ h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
+}
+
+void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ const uint8x16_t d1 = vld1q_u8(left + 16);
+ (void)above;
+ h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
+ h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
+ h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
+}
+
+void aom_h_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d0 = vld1_u8(left);
+ (void)above;
+ h_store_4x8(dst, stride, d0);
+}
+
+void aom_h_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ (void)above;
+ h_store_4x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_4x8(dst + 8 * stride, stride, vget_high_u8(d0));
+}
+
+void aom_h_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d0 = load_u8_4x1(left);
+ (void)above;
+ vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
+ vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
+ vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
+ vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
+}
+
+void aom_h_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ (void)above;
+ h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
+}
+
+void aom_h_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ const uint8x16_t d1 = vld1q_u8(left + 16);
+ (void)above;
+ h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
+ h_store_8x8(dst + 16 * stride, stride, vget_low_u8(d1));
+ h_store_8x8(dst + 24 * stride, stride, vget_high_u8(d1));
+}
+
+void aom_h_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d0 = load_u8_4x1(left);
+ (void)above;
+ vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
+ vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
+ vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
+ vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
+}
+
+void aom_h_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d0 = vld1_u8(left);
+ (void)above;
+ h_store_16x8(dst, stride, d0);
+}
+
+void aom_h_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ const uint8x16_t d1 = vld1q_u8(left + 16);
+ (void)above;
+ h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
+ h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
+ h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
+}
+
+void aom_h_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ const uint8x16_t d1 = vld1q_u8(left + 16);
+ const uint8x16_t d2 = vld1q_u8(left + 32);
+ const uint8x16_t d3 = vld1q_u8(left + 48);
+ (void)above;
+ h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
+ h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
+ h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
+ h_store_16x8(dst + 32 * stride, stride, vget_low_u8(d2));
+ h_store_16x8(dst + 40 * stride, stride, vget_high_u8(d2));
+ h_store_16x8(dst + 48 * stride, stride, vget_low_u8(d3));
+ h_store_16x8(dst + 56 * stride, stride, vget_high_u8(d3));
+}
+
+void aom_h_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d0 = vld1_u8(left);
+ (void)above;
+ h_store_32x8(dst, stride, d0);
+}
+
+void aom_h_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ (void)above;
+ h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
+}
+
+void aom_h_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left + 0);
+ const uint8x16_t d1 = vld1q_u8(left + 16);
+ const uint8x16_t d2 = vld1q_u8(left + 32);
+ const uint8x16_t d3 = vld1q_u8(left + 48);
+ (void)above;
+ h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
+ h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
+ h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
+ h_store_32x8(dst + 32 * stride, stride, vget_low_u8(d2));
+ h_store_32x8(dst + 40 * stride, stride, vget_high_u8(d2));
+ h_store_32x8(dst + 48 * stride, stride, vget_low_u8(d3));
+ h_store_32x8(dst + 56 * stride, stride, vget_high_u8(d3));
+}
+
+void aom_h_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ (void)above;
+ h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
+}
+
+void aom_h_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ for (int i = 0; i < 2; ++i) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
+ left += 16;
+ dst += 16 * stride;
+ }
+}
+
+void aom_h_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ for (int i = 0; i < 4; ++i) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
+ left += 16;
+ dst += 16 * stride;
+ }
+}
+
+/* ---------------------P R E D I C T I O N Z 1--------------------------- */
+
+// Low bit depth functions
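+// BaseMask[i] has its first i bytes set to 0xff and the remaining bytes set to
+// zero. It is used below to blend interpolated pixels (lanes still inside the
+// above array) with the replicated above[max_base_x] sample (lanes past it).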
+static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+};
+
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64(
+ int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above,
+ int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((W + H) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
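+  // This is a rounded linear interpolation between above[x] and above[x+1]:
+  //   (above[x] * (32 - shift) + above[x+1] * shift + 16) >> 5.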
+
+ const uint8x8_t a_mbase_x = vdup_n_u8(above[max_base_x]);
+
+ int x = dx;
+ for (int r = 0; r < W; r++) {
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base) >> upsample_above;
+ if (base_max_diff <= 0) {
+ for (int i = r; i < W; ++i) {
+        dst[i] = a_mbase_x; // Fill remaining rows with above[max_base_x].
+ }
+ return;
+ }
+
+ if (base_max_diff > H) base_max_diff = H;
+
+ uint8x8x2_t a01_128;
+ uint16x8_t shift;
+ if (upsample_above) {
+ a01_128 = vld2_u8(above + base);
+ shift = vdupq_n_u16(((x << upsample_above) & 0x3f) >> 1);
+ } else {
+ a01_128.val[0] = vld1_u8(above + base);
+ a01_128.val[1] = vld1_u8(above + base + 1);
+ shift = vdupq_n_u16((x & 0x3f) >> 1);
+ }
+ uint16x8_t diff = vsubl_u8(a01_128.val[1], a01_128.val[0]);
+ uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a01_128.val[0], vdup_n_u8(32));
+ uint16x8_t res = vmlaq_u16(a32, diff, shift);
+
+ uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]);
+ dst[r] = vbsl_u8(mask, vshrn_n_u16(res, 5), a_mbase_x);
+
+ x += dx;
+ }
+}
+
+static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ uint8x8_t dstvec[16];
+
+ dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above,
+ dx);
+ for (int i = 0; i < N; i++) {
+ vst1_lane_u32((uint32_t *)(dst + stride * i),
+ vreinterpret_u32_u8(dstvec[i]), 0);
+ }
+}
+
+static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ uint8x8_t dstvec[32];
+
+ dr_prediction_z1_HxW_internal_neon_64(8, N, dstvec, above, upsample_above,
+ dx);
+ for (int i = 0; i < N; i++) {
+ vst1_u8(dst + stride * i, dstvec[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon(
+ int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above,
+ int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((W + H) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+ const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
+
+ int x = dx;
+ for (int r = 0; r < W; r++) {
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base) >> upsample_above;
+ if (base_max_diff <= 0) {
+ for (int i = r; i < W; ++i) {
+        dst[i] = a_mbase_x; // Fill remaining rows with above[max_base_x].
+ }
+ return;
+ }
+
+ if (base_max_diff > H) base_max_diff = H;
+
+ uint16x8_t shift;
+ uint8x16_t a0_128, a1_128;
+ if (upsample_above) {
+ uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base);
+ a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]);
+ a1_128 = vextq_u8(a0_128, vdupq_n_u8(0), 8);
+ shift = vdupq_n_u16(x & 0x1f);
+ } else {
+ a0_128 = vld1q_u8(above + base);
+ a1_128 = vld1q_u8(above + base + 1);
+ shift = vdupq_n_u16((x & 0x3f) >> 1);
+ }
+ uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
+ uint16x8_t diff_hi = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
+ uint16x8_t a32_lo =
+ vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t a32_hi =
+ vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
+ uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
+ uint8x16_t v_temp =
+ vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
+
+ uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]);
+ dst[r] = vbslq_u8(mask, v_temp, a_mbase_x);
+
+ x += dx;
+ }
+}
+
+static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ uint8x16_t dstvec[64];
+
+ dr_prediction_z1_HxW_internal_neon(16, N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ vst1q_u8(dst + stride * i, dstvec[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon(
+ int N, uint8x16x2_t *dstvec, const uint8_t *above, int dx) {
+ const int frac_bits = 6;
+ const int max_base_x = ((32 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+ const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base);
+ if (base_max_diff <= 0) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i].val[0] = a_mbase_x; // save 32 values
+ dstvec[i].val[1] = a_mbase_x;
+ }
+ return;
+ }
+ if (base_max_diff > 32) base_max_diff = 32;
+
+ uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);
+
+ uint8x16_t res16[2];
+ for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
+ int mdiff = base_max_diff - j;
+ if (mdiff <= 0) {
+ res16[jj] = a_mbase_x;
+ } else {
+ uint8x16_t a0_128 = vld1q_u8(above + base + j);
+ uint8x16_t a1_128 = vld1q_u8(above + base + j + 1);
+ uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
+ uint16x8_t diff_hi =
+ vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
+ uint16x8_t a32_lo =
+ vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t a32_hi =
+ vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
+ uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
+
+ res16[jj] = vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
+ }
+ }
+
+ uint8x16_t mask_lo = vld1q_u8(BaseMask[base_max_diff]);
+ uint8x16_t mask_hi = vld1q_u8(BaseMask[base_max_diff] + 16);
+ dstvec[r].val[0] = vbslq_u8(mask_lo, res16[0], a_mbase_x);
+ dstvec[r].val[1] = vbslq_u8(mask_hi, res16[1], a_mbase_x);
+ x += dx;
+ }
+}
+
+static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int dx) {
+ uint8x16x2_t dstvec[64];
+
+ dr_prediction_z1_32xN_internal_neon(N, dstvec, above, dx);
+ for (int i = 0; i < N; i++) {
+ vst1q_u8(dst + stride * i, dstvec[i].val[0]);
+ vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]);
+ }
+}
+
+static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int dx) {
+ const int frac_bits = 6;
+ const int max_base_x = ((64 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+ const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
+ const uint8x16_t max_base_x128 = vdupq_n_u8(max_base_x);
+
+ int x = dx;
+ for (int r = 0; r < N; r++, dst += stride) {
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ vst1q_u8(dst, a_mbase_x);
+ vst1q_u8(dst + 16, a_mbase_x);
+ vst1q_u8(dst + 32, a_mbase_x);
+ vst1q_u8(dst + 48, a_mbase_x);
+ dst += stride;
+ }
+ return;
+ }
+
+ uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);
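+    // base_inc128 = base + {0, 1, ..., 15}; lanes at or beyond max_base_x are
+    // replaced with the replicated above[max_base_x] sample via mask128 below.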
+ uint8x16_t base_inc128 =
+ vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100),
+ vcreate_u8(0x0F0E0D0C0B0A0908)));
+
+ for (int j = 0; j < 64; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) {
+ vst1q_u8(dst + j, a_mbase_x);
+ } else {
+ uint8x16_t a0_128 = vld1q_u8(above + base + j);
+ uint8x16_t a1_128 = vld1q_u8(above + base + 1 + j);
+ uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
+ uint16x8_t diff_hi =
+ vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
+ uint16x8_t a32_lo =
+ vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t a32_hi =
+ vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
+ uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
+ uint8x16_t v_temp =
+ vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
+
+ uint8x16_t mask128 =
+ vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), vdupq_n_u8(0));
+ uint8x16_t res128 = vbslq_u8(mask128, v_temp, a_mbase_x);
+ vst1q_u8(dst + j, res128);
+
+ base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16));
+ }
+ }
+ x += dx;
+ }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int dx, int dy) {
+ (void)left;
+ (void)dy;
+
+ switch (bw) {
+ case 4:
+ dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 8:
+ dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 16:
+ dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 32: dr_prediction_z1_32xN_neon(bh, dst, stride, above, dx); break;
+ case 64: dr_prediction_z1_64xN_neon(bh, dst, stride, above, dx); break;
+ default: break;
+ }
+}
+
+/* ---------------------P R E D I C T I O N Z 2--------------------------- */
+
+#if !AOM_ARCH_AARCH64
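+// LoadMaskz2[i] sets the first 4 * (i + 1) bytes to 0xff and the remaining
+// bytes to zero.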
+static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = {
+ { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff }
+};
+#endif // !AOM_ARCH_AARCH64
+
+static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_above_neon(
+ const uint8_t *above, int upsample_above, int dx, int base_x, int y,
+ uint8x8_t *a0_x, uint8x8_t *a1_x, uint16x4_t *shift0) {
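+  // r6 holds {0, 64, 128, 192}: the column index scaled by 64, from which
+  // y * dx is subtracted to derive the interpolation shift for this row.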
+ uint16x4_t r6 = vcreate_u16(0x00C0008000400000);
+ uint16x4_t ydx = vdup_n_u16(y * dx);
+ if (upsample_above) {
+ // Cannot use LD2 here since we only want to load eight bytes, but LD2 can
+ // only load either 16 or 32.
+ uint8x8_t v_tmp = vld1_u8(above + base_x);
+ *a0_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[0];
+ *a1_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[1];
+ *shift0 = vand_u16(vsub_u16(r6, ydx), vdup_n_u16(0x1f));
+ } else {
+ *a0_x = load_u8_4x1(above + base_x);
+ *a1_x = load_u8_4x1(above + base_x + 1);
+ *shift0 = vand_u16(vhsub_u16(r6, ydx), vdup_n_u16(0x1f));
+ }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_left_neon(
+#if AOM_ARCH_AARCH64
+ uint8x16x2_t left_vals,
+#else
+ const uint8_t *left,
+#endif
+ int upsample_left, int dy, int r, int min_base_y, int frac_bits_y,
+ uint16x4_t *a0_y, uint16x4_t *a1_y, uint16x4_t *shift1) {
+ int16x4_t dy64 = vdup_n_s16(dy);
+ int16x4_t v_1234 = vcreate_s16(0x0004000300020001);
+ int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
+ int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
+ int16x4_t v_r6 = vdup_n_s16(r << 6);
+ int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64);
+ int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);
+
+ // Values in base_y_c64 range from -2 through 14 inclusive.
+ base_y_c64 = vmax_s16(base_y_c64, min_base_y64);
+
+#if AOM_ARCH_AARCH64
+ uint8x8_t left_idx0 =
+ vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2))); // [0, 16]
+ uint8x8_t left_idx1 =
+ vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3))); // [1, 17]
+
+ *a0_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx0));
+ *a1_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx1));
+#else // !AOM_ARCH_AARCH64
+ DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
+
+ vst1_s16(base_y_c, base_y_c64);
+ uint8x8_t a0_y_u8 = vdup_n_u8(0);
+ a0_y_u8 = vld1_lane_u8(left + base_y_c[0], a0_y_u8, 0);
+ a0_y_u8 = vld1_lane_u8(left + base_y_c[1], a0_y_u8, 2);
+ a0_y_u8 = vld1_lane_u8(left + base_y_c[2], a0_y_u8, 4);
+ a0_y_u8 = vld1_lane_u8(left + base_y_c[3], a0_y_u8, 6);
+
+ base_y_c64 = vadd_s16(base_y_c64, vdup_n_s16(1));
+ vst1_s16(base_y_c, base_y_c64);
+ uint8x8_t a1_y_u8 = vdup_n_u8(0);
+ a1_y_u8 = vld1_lane_u8(left + base_y_c[0], a1_y_u8, 0);
+ a1_y_u8 = vld1_lane_u8(left + base_y_c[1], a1_y_u8, 2);
+ a1_y_u8 = vld1_lane_u8(left + base_y_c[2], a1_y_u8, 4);
+ a1_y_u8 = vld1_lane_u8(left + base_y_c[3], a1_y_u8, 6);
+
+ *a0_y = vreinterpret_u16_u8(a0_y_u8);
+ *a1_y = vreinterpret_u16_u8(a1_y_u8);
+#endif // AOM_ARCH_AARCH64
+
+ if (upsample_left) {
+ *shift1 = vand_u16(vreinterpret_u16_s16(y_c64), vdup_n_u16(0x1f));
+ } else {
+ *shift1 =
+ vand_u16(vshr_n_u16(vreinterpret_u16_s16(y_c64), 1), vdup_n_u16(0x1f));
+ }
+}
+
+static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_above_neon(
+ const uint8_t *above, int upsample_above, int dx, int base_x, int y) {
+ uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
+ vcreate_u16(0x0008000700060005));
+ uint16x8_t ydx = vdupq_n_u16(y * dx);
+ uint16x8_t r6 = vshlq_n_u16(vextq_u16(c1234, vdupq_n_u16(0), 2), 6);
+
+ uint16x8_t shift0;
+ uint8x8_t a0_x0;
+ uint8x8_t a1_x0;
+ if (upsample_above) {
+ uint8x8x2_t v_tmp = vld2_u8(above + base_x);
+ a0_x0 = v_tmp.val[0];
+ a1_x0 = v_tmp.val[1];
+ shift0 = vandq_u16(vsubq_u16(r6, ydx), vdupq_n_u16(0x1f));
+ } else {
+ a0_x0 = vld1_u8(above + base_x);
+ a1_x0 = vld1_u8(above + base_x + 1);
+ shift0 = vandq_u16(vhsubq_u16(r6, ydx), vdupq_n_u16(0x1f));
+ }
+
+ uint16x8_t diff0 = vsubl_u8(a1_x0, a0_x0); // a[x+1] - a[x]
+ uint16x8_t a32 =
+ vmlal_u8(vdupq_n_u16(16), a0_x0, vdup_n_u8(32)); // a[x] * 32 + 16
+ uint16x8_t res = vmlaq_u16(a32, diff0, shift0);
+ return vshrn_n_u16(res, 5);
+}
+
+static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_left_neon(
+#if AOM_ARCH_AARCH64
+ uint8x16x3_t left_vals,
+#else
+ const uint8_t *left,
+#endif
+ int upsample_left, int dy, int r, int min_base_y, int frac_bits_y) {
+ int16x8_t v_r6 = vdupq_n_s16(r << 6);
+ int16x8_t dy128 = vdupq_n_s16(dy);
+ int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
+ int16x8_t min_base_y128 = vdupq_n_s16(min_base_y);
+
+ uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
+ vcreate_u16(0x0008000700060005));
+ int16x8_t y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128);
+ int16x8_t base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y);
+
+ // Values in base_y_c128 range from -2 through 31 inclusive.
+ base_y_c128 = vmaxq_s16(base_y_c128, min_base_y128);
+
+#if AOM_ARCH_AARCH64
+ uint8x16_t left_idx0 =
+ vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(2))); // [0, 33]
+ uint8x16_t left_idx1 =
+ vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(3))); // [1, 34]
+ uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
+
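+  // vuzp1 packs the low byte of every 16-bit index so that a single TBL
+  // produces left[base_y] in the low half of a01_x and left[base_y + 1] in
+  // the high half.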
+ uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01);
+ uint8x8_t a0_x1 = vget_low_u8(a01_x);
+ uint8x8_t a1_x1 = vget_high_u8(a01_x);
+#else // !AOM_ARCH_AARCH64
+ uint8x8_t a0_x1 = load_u8_gather_s16_x8(left, base_y_c128);
+ uint8x8_t a1_x1 = load_u8_gather_s16_x8(left + 1, base_y_c128);
+#endif // AOM_ARCH_AARCH64
+
+ uint16x8_t shift1;
+ if (upsample_left) {
+ shift1 = vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x1f));
+ } else {
+ shift1 = vshrq_n_u16(
+ vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x3f)), 1);
+ }
+
+ uint16x8_t diff1 = vsubl_u8(a1_x1, a0_x1);
+ uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a0_x1, vdup_n_u8(32));
+ uint16x8_t res = vmlaq_u16(a32, diff1, shift1);
+ return vshrn_n_u16(res, 5);
+}
+
+static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_above_neon(
+ const uint8_t *above, int dx, int base_x, int y, int j) {
+ uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
+ vcreate_u16(0x0007000600050004)),
+ vcombine_u16(vcreate_u16(0x000B000A00090008),
+ vcreate_u16(0x000F000E000D000C)) } };
+ uint16x8_t j256 = vdupq_n_u16(j);
+ uint16x8_t ydx = vdupq_n_u16((uint16_t)(y * dx));
+
+ const uint8x16_t a0_x128 = vld1q_u8(above + base_x + j);
+ const uint8x16_t a1_x128 = vld1q_u8(above + base_x + j + 1);
+ uint16x8_t res6_0 = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6);
+ uint16x8_t res6_1 = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6);
+ uint16x8_t shift0 =
+ vshrq_n_u16(vandq_u16(vsubq_u16(res6_0, ydx), vdupq_n_u16(0x3f)), 1);
+ uint16x8_t shift1 =
+ vshrq_n_u16(vandq_u16(vsubq_u16(res6_1, ydx), vdupq_n_u16(0x3f)), 1);
+ // a[x+1] - a[x]
+ uint16x8_t diff0 = vsubl_u8(vget_low_u8(a1_x128), vget_low_u8(a0_x128));
+ uint16x8_t diff1 = vsubl_u8(vget_high_u8(a1_x128), vget_high_u8(a0_x128));
+ // a[x] * 32 + 16
+ uint16x8_t a32_0 =
+ vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_x128), vdup_n_u8(32));
+ uint16x8_t a32_1 =
+ vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_x128), vdup_n_u8(32));
+ uint16x8_t res0 = vmlaq_u16(a32_0, diff0, shift0);
+ uint16x8_t res1 = vmlaq_u16(a32_1, diff1, shift1);
+ return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
+}
+
+static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_left_neon(
+#if AOM_ARCH_AARCH64
+ uint8x16x4_t left_vals0, uint8x16x4_t left_vals1,
+#else
+ const uint8_t *left,
+#endif
+ int dy, int r, int j) {
+  // upsample_above and upsample_left are always 0 here, by design of
+  // av1_use_intra_edge_upsample().
+ const int min_base_y = -1;
+
+ int16x8_t min_base_y256 = vdupq_n_s16(min_base_y);
+ int16x8_t half_min_base_y256 = vdupq_n_s16(min_base_y >> 1);
+ int16x8_t dy256 = vdupq_n_s16(dy);
+ uint16x8_t j256 = vdupq_n_u16(j);
+
+ uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
+ vcreate_u16(0x0007000600050004)),
+ vcombine_u16(vcreate_u16(0x000B000A00090008),
+ vcreate_u16(0x000F000E000D000C)) } };
+ uint16x8x2_t c1234 = { { vaddq_u16(c0123.val[0], vdupq_n_u16(1)),
+ vaddq_u16(c0123.val[1], vdupq_n_u16(1)) } };
+
+ int16x8_t v_r6 = vdupq_n_s16(r << 6);
+
+ int16x8_t c256_0 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[0]));
+ int16x8_t c256_1 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[1]));
+ int16x8_t mul16_lo = vreinterpretq_s16_u16(
+ vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_0, dy256)),
+ vreinterpretq_u16_s16(half_min_base_y256)));
+ int16x8_t mul16_hi = vreinterpretq_s16_u16(
+ vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_1, dy256)),
+ vreinterpretq_u16_s16(half_min_base_y256)));
+ int16x8_t y_c256_lo = vsubq_s16(v_r6, mul16_lo);
+ int16x8_t y_c256_hi = vsubq_s16(v_r6, mul16_hi);
+
+ int16x8_t base_y_c256_lo = vshrq_n_s16(y_c256_lo, 6);
+ int16x8_t base_y_c256_hi = vshrq_n_s16(y_c256_hi, 6);
+
+ base_y_c256_lo = vmaxq_s16(min_base_y256, base_y_c256_lo);
+ base_y_c256_hi = vmaxq_s16(min_base_y256, base_y_c256_hi);
+
+#if !AOM_ARCH_AARCH64
+ int16_t min_y = vgetq_lane_s16(base_y_c256_hi, 7);
+ int16_t max_y = vgetq_lane_s16(base_y_c256_lo, 0);
+ int16_t offset_diff = max_y - min_y;
+
+ uint8x8_t a0_y0;
+ uint8x8_t a0_y1;
+ uint8x8_t a1_y0;
+ uint8x8_t a1_y1;
+ if (offset_diff < 16) {
+ // Avoid gathers where the data we want is close together in memory.
+ // We don't need this for AArch64 since we can already use TBL to cover the
+ // full range of possible values.
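+    // Load left[min_y .. min_y + 15] contiguously and select lanes with TBL
+    // using the per-lane offsets (base_y - min_y) instead of 16 scalar loads.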
+ assert(offset_diff >= 0);
+ int16x8_t min_y256 = vdupq_lane_s16(vget_high_s16(base_y_c256_hi), 3);
+
+ int16x8x2_t base_y_offset;
+ base_y_offset.val[0] = vsubq_s16(base_y_c256_lo, min_y256);
+ base_y_offset.val[1] = vsubq_s16(base_y_c256_hi, min_y256);
+
+ int8x16_t base_y_offset128 = vcombine_s8(vqmovn_s16(base_y_offset.val[0]),
+ vqmovn_s16(base_y_offset.val[1]));
+
+ uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]);
+ uint8x16_t a0_y128 = vld1q_u8(left + min_y);
+ uint8x16_t a1_y128 = vld1q_u8(left + min_y + 1);
+ a0_y128 = vandq_u8(a0_y128, v_loadmaskz2);
+ a1_y128 = vandq_u8(a1_y128, v_loadmaskz2);
+
+ uint8x8_t v_index_low = vget_low_u8(vreinterpretq_u8_s8(base_y_offset128));
+ uint8x8_t v_index_high =
+ vget_high_u8(vreinterpretq_u8_s8(base_y_offset128));
+ uint8x8x2_t v_tmp, v_res;
+ v_tmp.val[0] = vget_low_u8(a0_y128);
+ v_tmp.val[1] = vget_high_u8(a0_y128);
+ v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
+ v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
+ a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
+ v_tmp.val[0] = vget_low_u8(a1_y128);
+ v_tmp.val[1] = vget_high_u8(a1_y128);
+ v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
+ v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
+ a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
+
+ a0_y0 = vget_low_u8(a0_y128);
+ a0_y1 = vget_high_u8(a0_y128);
+ a1_y0 = vget_low_u8(a1_y128);
+ a1_y1 = vget_high_u8(a1_y128);
+ } else {
+ a0_y0 = load_u8_gather_s16_x8(left, base_y_c256_lo);
+ a0_y1 = load_u8_gather_s16_x8(left, base_y_c256_hi);
+ a1_y0 = load_u8_gather_s16_x8(left + 1, base_y_c256_lo);
+ a1_y1 = load_u8_gather_s16_x8(left + 1, base_y_c256_hi);
+ }
+#else
+ // Values in left_idx{0,1} range from 0 through 63 inclusive.
+ uint8x16_t left_idx0 =
+ vreinterpretq_u8_s16(vaddq_s16(base_y_c256_lo, vdupq_n_s16(1)));
+ uint8x16_t left_idx1 =
+ vreinterpretq_u8_s16(vaddq_s16(base_y_c256_hi, vdupq_n_s16(1)));
+ uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
+
+ uint8x16_t a0_y01 = vqtbl4q_u8(left_vals0, left_idx01);
+ uint8x16_t a1_y01 = vqtbl4q_u8(left_vals1, left_idx01);
+
+ uint8x8_t a0_y0 = vget_low_u8(a0_y01);
+ uint8x8_t a0_y1 = vget_high_u8(a0_y01);
+ uint8x8_t a1_y0 = vget_low_u8(a1_y01);
+ uint8x8_t a1_y1 = vget_high_u8(a1_y01);
+#endif // !AOM_ARCH_AARCH64
+
+ uint16x8_t shifty_lo = vshrq_n_u16(
+ vandq_u16(vreinterpretq_u16_s16(y_c256_lo), vdupq_n_u16(0x3f)), 1);
+ uint16x8_t shifty_hi = vshrq_n_u16(
+ vandq_u16(vreinterpretq_u16_s16(y_c256_hi), vdupq_n_u16(0x3f)), 1);
+
+ // a[x+1] - a[x]
+ uint16x8_t diff_lo = vsubl_u8(a1_y0, a0_y0);
+ uint16x8_t diff_hi = vsubl_u8(a1_y1, a0_y1);
+ // a[x] * 32 + 16
+ uint16x8_t a32_lo = vmlal_u8(vdupq_n_u16(16), a0_y0, vdup_n_u8(32));
+ uint16x8_t a32_hi = vmlal_u8(vdupq_n_u16(16), a0_y1, vdup_n_u8(32));
+
+ uint16x8_t res0 = vmlaq_u16(a32_lo, diff_lo, shifty_lo);
+ uint16x8_t res1 = vmlaq_u16(a32_hi, diff_hi, shifty_hi);
+
+ return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
+}
+
+static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left,
+ int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ assert(dx > 0);
+  // Each output pixel interpolates between two adjacent reference samples:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  // Columns whose base position falls to the left of the above row take
+  // their samples from the left column in the same way, using dy.
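+  // Roughly, each row below corresponds to the following scalar loop
+  // (cf. the C reference av1_dr_prediction_z2_c; illustrative only):
+  //   for (int c = 0; c < bw; ++c) {  // bw = block width
+  //     int x = (c << 6) - (r + 1) * dx;
+  //     int base = x >> frac_bits_x;
+  //     if (base >= min_base_x) {
+  //       int shift = ((x * (1 << upsample_above)) & 0x3f) >> 1;
+  //       dst[c] = (above[base] * (32 - shift) +
+  //                 above[base + 1] * shift + 16) >> 5;
+  //     } else {  // Sample the left edge instead, driven by dy.
+  //       int y = (r << 6) - (c + 1) * dy;
+  //       base = y >> frac_bits_y;
+  //       int shift = ((y * (1 << upsample_left)) & 0x3f) >> 1;
+  //       dst[c] = (left[base] * (32 - shift) +
+  //                 left[base + 1] * shift + 16) >> 5;
+  //     }
+  //   }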
+
+#if AOM_ARCH_AARCH64
+ // Use ext rather than loading left + 14 directly to avoid over-read.
+ const uint8x16_t left_m2 = vld1q_u8(left - 2);
+ const uint8x16_t left_0 = vld1q_u8(left);
+ const uint8x16_t left_14 = vextq_u8(left_0, left_0, 14);
+ const uint8x16x2_t left_vals = { { left_m2, left_14 } };
+#define LEFT left_vals
+#else // !AOM_ARCH_AARCH64
+#define LEFT left
+#endif // AOM_ARCH_AARCH64
+
+ for (int r = 0; r < N; r++) {
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ const int base_min_diff =
+ (min_base_x - ((-y * dx) >> frac_bits_x) + upsample_above) >>
+ upsample_above;
+
+ if (base_min_diff <= 0) {
+ uint8x8_t a0_x_u8, a1_x_u8;
+ uint16x4_t shift0;
+ dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
+ &a0_x_u8, &a1_x_u8, &shift0);
+ uint8x8_t a0_x = a0_x_u8;
+ uint8x8_t a1_x = a1_x_u8;
+
+ uint16x8_t diff = vsubl_u8(a1_x, a0_x); // a[x+1] - a[x]
+ uint16x8_t a32 =
+ vmlal_u8(vdupq_n_u16(16), a0_x, vdup_n_u8(32)); // a[x] * 32 + 16
+ uint16x8_t res =
+ vmlaq_u16(a32, diff, vcombine_u16(shift0, vdup_n_u16(0)));
+ uint8x8_t resx = vshrn_n_u16(res, 5);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resx), 0);
+ } else if (base_min_diff < 4) {
+ uint8x8_t a0_x_u8, a1_x_u8;
+ uint16x4_t shift0;
+ dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
+ &a0_x_u8, &a1_x_u8, &shift0);
+ uint16x8_t a0_x = vmovl_u8(a0_x_u8);
+ uint16x8_t a1_x = vmovl_u8(a1_x_u8);
+
+ uint16x4_t a0_y;
+ uint16x4_t a1_y;
+ uint16x4_t shift1;
+ dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
+ frac_bits_y, &a0_y, &a1_y, &shift1);
+ a0_x = vcombine_u16(vget_low_u16(a0_x), a0_y);
+ a1_x = vcombine_u16(vget_low_u16(a1_x), a1_y);
+
+ uint16x8_t shift = vcombine_u16(shift0, shift1);
+ uint16x8_t diff = vsubq_u16(a1_x, a0_x); // a[x+1] - a[x]
+ uint16x8_t a32 =
+ vmlaq_n_u16(vdupq_n_u16(16), a0_x, 32); // a[x] * 32 + 16
+ uint16x8_t res = vmlaq_u16(a32, diff, shift);
+ uint8x8_t resx = vshrn_n_u16(res, 5);
+ uint8x8_t resy = vext_u8(resx, vdup_n_u8(0), 4);
+
+ uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
+ uint8x8_t v_resxy = vbsl_u8(mask, resy, resx);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0);
+ } else {
+ uint16x4_t a0_y, a1_y;
+ uint16x4_t shift1;
+ dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
+ frac_bits_y, &a0_y, &a1_y, &shift1);
+ uint16x4_t diff = vsub_u16(a1_y, a0_y); // a[x+1] - a[x]
+ uint16x4_t a32 = vmla_n_u16(vdup_n_u16(16), a0_y, 32); // a[x] * 32 + 16
+ uint16x4_t res = vmla_u16(a32, diff, shift1);
+ uint8x8_t resy = vshrn_n_u16(vcombine_u16(res, vdup_n_u16(0)), 5);
+
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resy), 0);
+ }
+
+ dst += stride;
+ }
+#undef LEFT
+}
+
+static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left,
+ int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+  // Each output pixel interpolates between two adjacent reference samples:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  // Columns whose base position falls to the left of the above row take
+  // their samples from the left column in the same way, using dy.
+
+#if AOM_ARCH_AARCH64
+ // Use ext rather than loading left + 30 directly to avoid over-read.
+ const uint8x16_t left_m2 = vld1q_u8(left - 2);
+ const uint8x16_t left_0 = vld1q_u8(left + 0);
+ const uint8x16_t left_16 = vld1q_u8(left + 16);
+ const uint8x16_t left_14 = vextq_u8(left_0, left_16, 14);
+ const uint8x16_t left_30 = vextq_u8(left_16, left_16, 14);
+ const uint8x16x3_t left_vals = { { left_m2, left_14, left_30 } };
+#define LEFT left_vals
+#else // !AOM_ARCH_AARCH64
+#define LEFT left
+#endif // AOM_ARCH_AARCH64
+
+ for (int r = 0; r < N; r++) {
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+
+ if (base_min_diff <= 0) {
+ uint8x8_t resx =
+ dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
+ vst1_u8(dst, resx);
+ } else if (base_min_diff < 8) {
+ uint8x8_t resx =
+ dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
+ uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
+ LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
+ uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
+ uint8x8_t resxy = vbsl_u8(mask, resy, resx);
+ vst1_u8(dst, resxy);
+ } else {
+ uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
+ LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
+ vst1_u8(dst, resy);
+ }
+
+ dst += stride;
+ }
+#undef LEFT
+}
+
+static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst,
+ ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left, int dx, int dy) {
+  // upsample_above and upsample_left are always 0 here, by design of
+  // av1_use_intra_edge_upsample().
+ const int min_base_x = -1;
+
+#if AOM_ARCH_AARCH64
+ const uint8x16_t left_m1 = vld1q_u8(left - 1);
+ const uint8x16_t left_0 = vld1q_u8(left + 0);
+ const uint8x16_t left_16 = vld1q_u8(left + 16);
+ const uint8x16_t left_32 = vld1q_u8(left + 32);
+ const uint8x16_t left_48 = vld1q_u8(left + 48);
+ const uint8x16_t left_15 = vextq_u8(left_0, left_16, 15);
+ const uint8x16_t left_31 = vextq_u8(left_16, left_32, 15);
+ const uint8x16_t left_47 = vextq_u8(left_32, left_48, 15);
+ const uint8x16x4_t left_vals0 = { { left_m1, left_15, left_31, left_47 } };
+ const uint8x16x4_t left_vals1 = { { left_0, left_16, left_32, left_48 } };
+#define LEFT left_vals0, left_vals1
+#else // !AOM_ARCH_AARCH64
+#define LEFT left
+#endif // AOM_ARCH_AARCH64
+
+ for (int r = 0; r < H; r++) {
+ int y = r + 1;
+ int base_x = (-y * dx) >> 6;
+ for (int j = 0; j < W; j += 16) {
+ const int base_min_diff = min_base_x - base_x - j;
+
+ if (base_min_diff <= 0) {
+ uint8x16_t resx =
+ dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
+ vst1q_u8(dst + j, resx);
+ } else if (base_min_diff < 16) {
+ uint8x16_t resx =
+ dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
+ uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
+ uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]);
+ uint8x16_t resxy = vbslq_u8(mask, resy, resx);
+ vst1q_u8(dst + j, resxy);
+ } else {
+ uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
+ vst1q_u8(dst + j, resy);
+ }
+ } // for j
+ dst += stride;
+ }
+#undef LEFT
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left, int dx,
+ int dy) {
+ assert(dx > 0);
+ assert(dy > 0);
+
+ switch (bw) {
+ case 4:
+ dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above,
+ upsample_left, dx, dy);
+ break;
+ case 8:
+ dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, upsample_above,
+ upsample_left, dx, dy);
+ break;
+ default:
+ dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left, dx, dy);
+ break;
+ }
+}
+
+/* ---------------------P R E D I C T I O N Z 3--------------------------- */
+
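+// Directional prediction, zone 3: 180 < angle < 270
+//
+// Zone 3 depends only on the left edge, so each block is produced by running
+// the zone 1 kernels over the left array (as if it were a row of above
+// samples) and then transposing the result into the destination.
+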
+static AOM_FORCE_INLINE void z3_transpose_arrays_u8_16x4(const uint8x16_t *x,
+ uint8x16x2_t *d) {
+ uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
+ uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
+
+ d[0] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+ vreinterpretq_u16_u8(w1.val[0])));
+ d[1] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+ vreinterpretq_u16_u8(w1.val[1])));
+}
+
+static AOM_FORCE_INLINE void z3_transpose_arrays_u8_4x4(const uint8x8_t *x,
+ uint8x8x2_t *d) {
+ uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
+ uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
+
+ *d = aom_reinterpret_u8_u16_x2(
+ vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])));
+}
+
+static AOM_FORCE_INLINE void z3_transpose_arrays_u8_8x4(const uint8x8_t *x,
+ uint8x8x2_t *d) {
+ uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
+ uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
+
+ d[0] = aom_reinterpret_u8_u16_x2(
+ vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])));
+ d[1] = aom_reinterpret_u8_u16_x2(
+ vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])));
+}
+
+static void z3_transpose_arrays_u8_16x16(const uint8_t *src, ptrdiff_t pitchSrc,
+ uint8_t *dst, ptrdiff_t pitchDst) {
+ // The same as the normal transposes in transpose_neon.h, but with a stride
+ // between consecutive vectors of elements.
+ uint8x16_t r[16];
+ uint8x16_t d[16];
+ for (int i = 0; i < 16; i++) {
+ r[i] = vld1q_u8(src + i * pitchSrc);
+ }
+ transpose_arrays_u8_16x16(r, d);
+ for (int i = 0; i < 16; i++) {
+ vst1q_u8(dst + i * pitchDst, d[i]);
+ }
+}
+
+static void z3_transpose_arrays_u8_16nx16n(const uint8_t *src,
+ ptrdiff_t pitchSrc, uint8_t *dst,
+ ptrdiff_t pitchDst, int width,
+ int height) {
+ for (int j = 0; j < height; j += 16) {
+ for (int i = 0; i < width; i += 16) {
+ z3_transpose_arrays_u8_16x16(src + i * pitchSrc + j, pitchSrc,
+ dst + j * pitchDst + i, pitchDst);
+ }
+ }
+}
+
+static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x8_t dstvec[4];
+ uint8x8x2_t dest;
+
+ dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy);
+ z3_transpose_arrays_u8_4x4(dstvec, &dest);
+ store_u8x4_strided_x2(dst + stride * 0, stride, dest.val[0]);
+ store_u8x4_strided_x2(dst + stride * 2, stride, dest.val[1]);
+}
+
+static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x8_t dstvec[8];
+ uint8x8_t d[8];
+
+ dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy);
+ transpose_arrays_u8_8x8(dstvec, d);
+ store_u8_8x8(dst, stride, d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
+}
+
+static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x8_t dstvec[4];
+ uint8x8x2_t d[2];
+
+ dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy);
+ z3_transpose_arrays_u8_8x4(dstvec, d);
+ store_u8x4_strided_x2(dst + stride * 0, stride, d[0].val[0]);
+ store_u8x4_strided_x2(dst + stride * 2, stride, d[0].val[1]);
+ store_u8x4_strided_x2(dst + stride * 4, stride, d[1].val[0]);
+ store_u8x4_strided_x2(dst + stride * 6, stride, d[1].val[1]);
+}
+
+static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x8_t dstvec[8];
+ uint8x8_t d[8];
+
+ dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy);
+ transpose_arrays_u8_8x8(dstvec, d);
+ store_u8_8x4(dst, stride, d[0], d[1], d[2], d[3]);
+}
+
+static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x16_t dstvec[8];
+ uint8x8_t d[16];
+
+ dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy);
+ transpose_arrays_u8_16x8(dstvec, d);
+ for (int i = 0; i < 16; i++) {
+ vst1_u8(dst + i * stride, d[i]);
+ }
+}
+
+static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x8_t dstvec[16];
+ uint8x16_t d[8];
+
+ dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy);
+ transpose_arrays_u8_8x16(dstvec, d);
+ for (int i = 0; i < 8; i++) {
+ vst1q_u8(dst + i * stride, d[i]);
+ }
+}
+
+static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x16_t dstvec[4];
+ uint8x16x2_t d[2];
+
+ dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy);
+ z3_transpose_arrays_u8_16x4(dstvec, d);
+ store_u8x4_strided_x4(dst + stride * 0, stride, d[0].val[0]);
+ store_u8x4_strided_x4(dst + stride * 4, stride, d[0].val[1]);
+ store_u8x4_strided_x4(dst + stride * 8, stride, d[1].val[0]);
+ store_u8x4_strided_x4(dst + stride * 12, stride, d[1].val[1]);
+}
+
+static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x8_t dstvec[16];
+ uint8x16_t d[8];
+
+ dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy);
+ transpose_arrays_u8_8x16(dstvec, d);
+ for (int i = 0; i < 4; i++) {
+ vst1q_u8(dst + i * stride, d[i]);
+ }
+}
+
+static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ (void)upsample_left;
+ uint8x16x2_t dstvec[16];
+ uint8x16_t d[32];
+ uint8x16_t v_zero = vdupq_n_u8(0);
+
+ dr_prediction_z1_32xN_internal_neon(8, dstvec, left, dy);
+ for (int i = 8; i < 16; i++) {
+ dstvec[i].val[0] = v_zero;
+ dstvec[i].val[1] = v_zero;
+ }
+ transpose_arrays_u8_32x16(dstvec, d);
+ for (int i = 0; i < 32; i++) {
+ vst1_u8(dst + i * stride, vget_low_u8(d[i]));
+ }
+}
+
+static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x8_t dstvec[32];
+ uint8x16_t d[16];
+
+ dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy);
+ transpose_arrays_u8_8x16(dstvec, d);
+ transpose_arrays_u8_8x16(dstvec + 16, d + 8);
+ for (int i = 0; i < 8; i++) {
+ vst1q_u8(dst + i * stride, d[i]);
+ vst1q_u8(dst + i * stride + 16, d[i + 8]);
+ }
+}
+
+static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x16_t dstvec[16];
+ uint8x16_t d[16];
+
+ dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy);
+ transpose_arrays_u8_16x16(dstvec, d);
+ for (int i = 0; i < 16; i++) {
+ vst1q_u8(dst + i * stride, d[i]);
+ }
+}
+
+static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ (void)upsample_left;
+ uint8x16x2_t dstvec[32];
+ uint8x16_t d[64];
+
+ dr_prediction_z1_32xN_internal_neon(32, dstvec, left, dy);
+ transpose_arrays_u8_32x16(dstvec, d);
+ transpose_arrays_u8_32x16(dstvec + 16, d + 32);
+ for (int i = 0; i < 32; i++) {
+ vst1q_u8(dst + i * stride, d[i]);
+ vst1q_u8(dst + i * stride + 16, d[i + 32]);
+ }
+}
+
+static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ (void)upsample_left;
+ DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
+
+ dr_prediction_z1_64xN_neon(64, dstT, 64, left, dy);
+ z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 64, 64);
+}
+
+static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ (void)upsample_left;
+ uint8x16x2_t dstvec[16];
+ uint8x16_t d[32];
+
+ dr_prediction_z1_32xN_internal_neon(16, dstvec, left, dy);
+ transpose_arrays_u8_32x16(dstvec, d);
+ for (int i = 0; i < 16; i++) {
+ vst1q_u8(dst + 2 * i * stride, d[2 * i + 0]);
+ vst1q_u8(dst + (2 * i + 1) * stride, d[2 * i + 1]);
+ }
+}
+
+static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x16_t dstvec[32];
+
+ dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy);
+ for (int i = 0; i < 32; i += 16) {
+ uint8x16_t d[16];
+ transpose_arrays_u8_16x16(dstvec + i, d);
+ for (int j = 0; j < 16; j++) {
+ vst1q_u8(dst + j * stride + i, d[j]);
+ }
+ }
+}
+
+static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ (void)upsample_left;
+ uint8_t dstT[64 * 32];
+
+ dr_prediction_z1_64xN_neon(32, dstT, 64, left, dy);
+ z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 32, 64);
+}
+
+static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ (void)upsample_left;
+ uint8_t dstT[32 * 64];
+
+ dr_prediction_z1_32xN_neon(64, dstT, 32, left, dy);
+ z3_transpose_arrays_u8_16nx16n(dstT, 32, dst, stride, 64, 32);
+}
+
+static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ (void)upsample_left;
+ uint8_t dstT[64 * 16];
+
+ dr_prediction_z1_64xN_neon(16, dstT, 64, left, dy);
+ z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 16, 64);
+}
+
+static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x16_t dstvec[64];
+
+ dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy);
+ for (int i = 0; i < 64; i += 16) {
+ uint8x16_t d[16];
+ transpose_arrays_u8_16x16(dstvec + i, d);
+ for (int j = 0; j < 16; ++j) {
+ vst1q_u8(dst + j * stride + i, d[j]);
+ }
+ }
+}
+
+typedef void (*dr_prediction_z3_fn)(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy);
+
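+// Lookup table indexed by [get_msb(bw)][get_msb(bh)]; block dimensions 4
+// through 64 map to indices 2 through 6.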
+static dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = {
+ { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+ { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon,
+ dr_prediction_z3_4x16_neon, NULL, NULL },
+ { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon,
+ dr_prediction_z3_8x16_neon, dr_prediction_z3_8x32_neon, NULL },
+ { NULL, NULL, dr_prediction_z3_16x4_neon, dr_prediction_z3_16x8_neon,
+ dr_prediction_z3_16x16_neon, dr_prediction_z3_16x32_neon,
+ dr_prediction_z3_16x64_neon },
+ { NULL, NULL, NULL, dr_prediction_z3_32x8_neon, dr_prediction_z3_32x16_neon,
+ dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon },
+ { NULL, NULL, NULL, NULL, dr_prediction_z3_64x16_neon,
+ dr_prediction_z3_64x32_neon, dr_prediction_z3_64x64_neon },
+};
+
+void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_left, int dx, int dy) {
+ (void)above;
+ (void)dx;
+ assert(dx == 1);
+ assert(dy > 0);
+
+ dr_prediction_z3_fn f = dr_prediction_z3_arr[get_msb(bw)][get_msb(bh)];
+ assert(f != NULL);
+ f(dst, stride, left, upsample_left, dy);
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_PRED
+
+// 256 - v = vneg_s8(v)
+static INLINE uint8x8_t negate_s8(const uint8x8_t v) {
+ return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
+}
+
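+// Per-pixel SMOOTH blend computed by the kernels below. vhaddq halves the
+// sum of the two weighted terms and vrshrn rounds it down to 8 bits:
+//   vert = w_y[y] * top[x] + (256 - w_y[y]) * bottom_left
+//   horz = w_x[x] * left[y] + (256 - w_x[x]) * top_right
+//   dst[x] = ROUND_POWER_OF_TWO((vert + horz) >> 1, SMOOTH_WEIGHT_LOG2_SCALE)
+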
+static void smooth_4xh_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *const top_row,
+ const uint8_t *const left_column,
+ const int height) {
+ const uint8_t top_right = top_row[3];
+ const uint8_t bottom_left = left_column[height - 1];
+ const uint8_t *const weights_y = smooth_weights + height - 4;
+
+ uint8x8_t top_v = load_u8_4x1(top_row);
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+ uint8x8_t weights_x_v = load_u8_4x1(smooth_weights);
+ const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+
+ assert(height > 0);
+ int y = 0;
+ do {
+ const uint8x8_t left_v = vdup_n_u8(left_column[y]);
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
+ const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+ const uint16x8_t weighted_top_bl =
+ vmlal_u8(weighted_bl, weights_y_v, top_v);
+ const uint16x8_t weighted_left_tr =
+ vmlal_u8(weighted_tr, weights_x_v, left_v);
+ // Maximum value of each parameter: 0xFF00
+ const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+ const uint8x8_t result = vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
+
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(result), 0);
+ dst += stride;
+ } while (++y != height);
+}
+
+static INLINE uint8x8_t calculate_pred(const uint16x8_t weighted_top_bl,
+ const uint16x8_t weighted_left_tr) {
+ // Maximum value of each parameter: 0xFF00
+ const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+ return vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
+}
+
+static INLINE uint8x8_t calculate_weights_and_pred(
+ const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
+ const uint8x8_t bottom_left, const uint8x8_t weights_x,
+ const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
+ const uint16x8_t weighted_top = vmull_u8(weights_y, top);
+ const uint16x8_t weighted_top_bl =
+ vmlal_u8(weighted_top, scaled_weights_y, bottom_left);
+ const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left);
+ return calculate_pred(weighted_top_bl, weighted_left_tr);
+}
+
+static void smooth_8xh_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *const top_row,
+ const uint8_t *const left_column,
+ const int height) {
+ const uint8_t top_right = top_row[7];
+ const uint8_t bottom_left = left_column[height - 1];
+ const uint8_t *const weights_y = smooth_weights + height - 4;
+
+ const uint8x8_t top_v = vld1_u8(top_row);
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+ const uint8x8_t weights_x_v = vld1_u8(smooth_weights + 4);
+ const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+
+ assert(height > 0);
+ int y = 0;
+ do {
+ const uint8x8_t left_v = vdup_n_u8(left_column[y]);
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
+ const uint8x8_t result =
+ calculate_weights_and_pred(top_v, left_v, weighted_tr, bottom_left_v,
+ weights_x_v, scaled_weights_y, weights_y_v);
+
+ vst1_u8(dst, result);
+ dst += stride;
+ } while (++y != height);
+}
+
+#define SMOOTH_NXM(W, H) \
+ void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ smooth_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+SMOOTH_NXM(4, 4)
+SMOOTH_NXM(4, 8)
+SMOOTH_NXM(8, 4)
+SMOOTH_NXM(8, 8)
+SMOOTH_NXM(4, 16)
+SMOOTH_NXM(8, 16)
+SMOOTH_NXM(8, 32)
+
+#undef SMOOTH_NXM
+
+static INLINE uint8x16_t calculate_weights_and_predq(
+ const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
+ const uint8x8_t weights_y, const uint8x16_t weights_x,
+ const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
+ const uint16x8_t weighted_top_bl_low =
+ vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+ const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+ const uint16x8_t weighted_left_tr_low =
+ vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+ const uint8x8_t result_low =
+ calculate_pred(weighted_top_bl_low, weighted_left_tr_low);
+
+ const uint16x8_t weighted_top_bl_high =
+ vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
+ const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
+ const uint16x8_t weighted_left_tr_high =
+ vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
+ const uint8x8_t result_high =
+ calculate_pred(weighted_top_bl_high, weighted_left_tr_high);
+
+ return vcombine_u8(result_low, result_high);
+}
+
+// 256 - v = vneg_s8(v)
+static INLINE uint8x16_t negate_s8q(const uint8x16_t v) {
+ return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
+}
+
+// For width 16 and above.
+#define SMOOTH_PREDICTOR(W) \
+ static void smooth_##W##xh_neon( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+ const uint8_t *const left_column, const int height) { \
+ const uint8_t top_right = top_row[(W)-1]; \
+ const uint8_t bottom_left = left_column[height - 1]; \
+ const uint8_t *const weights_y = smooth_weights + height - 4; \
+ \
+ uint8x16_t top_v[4]; \
+ top_v[0] = vld1q_u8(top_row); \
+ if ((W) > 16) { \
+ top_v[1] = vld1q_u8(top_row + 16); \
+ if ((W) == 64) { \
+ top_v[2] = vld1q_u8(top_row + 32); \
+ top_v[3] = vld1q_u8(top_row + 48); \
+ } \
+ } \
+ \
+ const uint8x8_t top_right_v = vdup_n_u8(top_right); \
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \
+ \
+ uint8x16_t weights_x_v[4]; \
+ weights_x_v[0] = vld1q_u8(smooth_weights + (W)-4); \
+ if ((W) > 16) { \
+ weights_x_v[1] = vld1q_u8(smooth_weights + (W) + 16 - 4); \
+ if ((W) == 64) { \
+ weights_x_v[2] = vld1q_u8(smooth_weights + (W) + 32 - 4); \
+ weights_x_v[3] = vld1q_u8(smooth_weights + (W) + 48 - 4); \
+ } \
+ } \
+ \
+ uint8x16_t scaled_weights_x[4]; \
+ scaled_weights_x[0] = negate_s8q(weights_x_v[0]); \
+ if ((W) > 16) { \
+ scaled_weights_x[1] = negate_s8q(weights_x_v[1]); \
+ if ((W) == 64) { \
+ scaled_weights_x[2] = negate_s8q(weights_x_v[2]); \
+ scaled_weights_x[3] = negate_s8q(weights_x_v[3]); \
+ } \
+ } \
+ \
+ for (int y = 0; y < height; ++y) { \
+ const uint8x8_t left_v = vdup_n_u8(left_column[y]); \
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \
+ const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \
+ const uint16x8_t weighted_bl = \
+ vmull_u8(scaled_weights_y, bottom_left_v); \
+ \
+ vst1q_u8(dst, calculate_weights_and_predq( \
+ top_v[0], left_v, top_right_v, weights_y_v, \
+ weights_x_v[0], scaled_weights_x[0], weighted_bl)); \
+ \
+ if ((W) > 16) { \
+ vst1q_u8(dst + 16, \
+ calculate_weights_and_predq( \
+ top_v[1], left_v, top_right_v, weights_y_v, \
+ weights_x_v[1], scaled_weights_x[1], weighted_bl)); \
+ if ((W) == 64) { \
+ vst1q_u8(dst + 32, \
+ calculate_weights_and_predq( \
+ top_v[2], left_v, top_right_v, weights_y_v, \
+ weights_x_v[2], scaled_weights_x[2], weighted_bl)); \
+ vst1q_u8(dst + 48, \
+ calculate_weights_and_predq( \
+ top_v[3], left_v, top_right_v, weights_y_v, \
+ weights_x_v[3], scaled_weights_x[3], weighted_bl)); \
+ } \
+ } \
+ \
+ dst += stride; \
+ } \
+ }
+
+SMOOTH_PREDICTOR(16)
+SMOOTH_PREDICTOR(32)
+SMOOTH_PREDICTOR(64)
+
+#undef SMOOTH_PREDICTOR
+
+#define SMOOTH_NXM_WIDE(W, H) \
+ void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ smooth_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+SMOOTH_NXM_WIDE(16, 4)
+SMOOTH_NXM_WIDE(16, 8)
+SMOOTH_NXM_WIDE(16, 16)
+SMOOTH_NXM_WIDE(16, 32)
+SMOOTH_NXM_WIDE(16, 64)
+SMOOTH_NXM_WIDE(32, 8)
+SMOOTH_NXM_WIDE(32, 16)
+SMOOTH_NXM_WIDE(32, 32)
+SMOOTH_NXM_WIDE(32, 64)
+SMOOTH_NXM_WIDE(64, 16)
+SMOOTH_NXM_WIDE(64, 32)
+SMOOTH_NXM_WIDE(64, 64)
+
+#undef SMOOTH_NXM_WIDE
+
+// -----------------------------------------------------------------------------
+// SMOOTH_V_PRED
+
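+// Vertical-only variant of the SMOOTH blend:
+//   dst[x] = ROUND_POWER_OF_TWO(
+//       w_y[y] * top[x] + (256 - w_y[y]) * bottom_left,
+//       SMOOTH_WEIGHT_LOG2_SCALE)
+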
+// For widths 4 and 8.
+#define SMOOTH_V_PREDICTOR(W) \
+ static void smooth_v_##W##xh_neon( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+ const uint8_t *const left_column, const int height) { \
+ const uint8_t bottom_left = left_column[height - 1]; \
+ const uint8_t *const weights_y = smooth_weights + height - 4; \
+ \
+ uint8x8_t top_v; \
+ if ((W) == 4) { \
+ top_v = load_u8_4x1(top_row); \
+ } else { /* width == 8 */ \
+ top_v = vld1_u8(top_row); \
+ } \
+ \
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \
+ \
+ assert(height > 0); \
+ int y = 0; \
+ do { \
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \
+ const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \
+ \
+ const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); \
+ const uint16x8_t weighted_top_bl = \
+ vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v); \
+ const uint8x8_t pred = \
+ vrshrn_n_u16(weighted_top_bl, SMOOTH_WEIGHT_LOG2_SCALE); \
+ \
+ if ((W) == 4) { \
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \
+ } else { /* width == 8 */ \
+ vst1_u8(dst, pred); \
+ } \
+ dst += stride; \
+ } while (++y != height); \
+ }
+
+SMOOTH_V_PREDICTOR(4)
+SMOOTH_V_PREDICTOR(8)
+
+#undef SMOOTH_V_PREDICTOR
+
+#define SMOOTH_V_NXM(W, H) \
+ void aom_smooth_v_predictor_##W##x##H##_neon( \
+ uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+ const uint8_t *left) { \
+ smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+SMOOTH_V_NXM(4, 4)
+SMOOTH_V_NXM(4, 8)
+SMOOTH_V_NXM(4, 16)
+SMOOTH_V_NXM(8, 4)
+SMOOTH_V_NXM(8, 8)
+SMOOTH_V_NXM(8, 16)
+SMOOTH_V_NXM(8, 32)
+
+#undef SMOOTH_V_NXM
+
+static INLINE uint8x16_t calculate_vertical_weights_and_pred(
+ const uint8x16_t top, const uint8x8_t weights_y,
+ const uint16x8_t weighted_bl) {
+ const uint16x8_t pred_low =
+ vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+ const uint16x8_t pred_high =
+ vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
+ const uint8x8_t pred_scaled_low =
+ vrshrn_n_u16(pred_low, SMOOTH_WEIGHT_LOG2_SCALE);
+ const uint8x8_t pred_scaled_high =
+ vrshrn_n_u16(pred_high, SMOOTH_WEIGHT_LOG2_SCALE);
+ return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+// For width 16 and above.
+#define SMOOTH_V_PREDICTOR(W) \
+ static void smooth_v_##W##xh_neon( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+ const uint8_t *const left_column, const int height) { \
+ const uint8_t bottom_left = left_column[height - 1]; \
+ const uint8_t *const weights_y = smooth_weights + height - 4; \
+ \
+ uint8x16_t top_v[4]; \
+ top_v[0] = vld1q_u8(top_row); \
+ if ((W) > 16) { \
+ top_v[1] = vld1q_u8(top_row + 16); \
+ if ((W) == 64) { \
+ top_v[2] = vld1q_u8(top_row + 32); \
+ top_v[3] = vld1q_u8(top_row + 48); \
+ } \
+ } \
+ \
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \
+ \
+ assert(height > 0); \
+ int y = 0; \
+ do { \
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \
+ const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \
+ const uint16x8_t weighted_bl = \
+ vmull_u8(scaled_weights_y, bottom_left_v); \
+ \
+ const uint8x16_t pred_0 = calculate_vertical_weights_and_pred( \
+ top_v[0], weights_y_v, weighted_bl); \
+ vst1q_u8(dst, pred_0); \
+ \
+ if ((W) > 16) { \
+ const uint8x16_t pred_1 = calculate_vertical_weights_and_pred( \
+ top_v[1], weights_y_v, weighted_bl); \
+ vst1q_u8(dst + 16, pred_1); \
+ \
+ if ((W) == 64) { \
+ const uint8x16_t pred_2 = calculate_vertical_weights_and_pred( \
+ top_v[2], weights_y_v, weighted_bl); \
+ vst1q_u8(dst + 32, pred_2); \
+ \
+ const uint8x16_t pred_3 = calculate_vertical_weights_and_pred( \
+ top_v[3], weights_y_v, weighted_bl); \
+ vst1q_u8(dst + 48, pred_3); \
+ } \
+ } \
+ \
+ dst += stride; \
+ } while (++y != height); \
+ }
+
+SMOOTH_V_PREDICTOR(16)
+SMOOTH_V_PREDICTOR(32)
+SMOOTH_V_PREDICTOR(64)
+
+#undef SMOOTH_V_PREDICTOR
+
+#define SMOOTH_V_NXM_WIDE(W, H) \
+ void aom_smooth_v_predictor_##W##x##H##_neon( \
+ uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+ const uint8_t *left) { \
+ smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+SMOOTH_V_NXM_WIDE(16, 4)
+SMOOTH_V_NXM_WIDE(16, 8)
+SMOOTH_V_NXM_WIDE(16, 16)
+SMOOTH_V_NXM_WIDE(16, 32)
+SMOOTH_V_NXM_WIDE(16, 64)
+SMOOTH_V_NXM_WIDE(32, 8)
+SMOOTH_V_NXM_WIDE(32, 16)
+SMOOTH_V_NXM_WIDE(32, 32)
+SMOOTH_V_NXM_WIDE(32, 64)
+SMOOTH_V_NXM_WIDE(64, 16)
+SMOOTH_V_NXM_WIDE(64, 32)
+SMOOTH_V_NXM_WIDE(64, 64)
+
+#undef SMOOTH_V_NXM_WIDE
+
+// -----------------------------------------------------------------------------
+// SMOOTH_H_PRED
+
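+// Horizontal-only variant of the SMOOTH blend:
+//   dst[x] = ROUND_POWER_OF_TWO(
+//       w_x[x] * left[y] + (256 - w_x[x]) * top_right,
+//       SMOOTH_WEIGHT_LOG2_SCALE)
+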
+// For widths 4 and 8.
+#define SMOOTH_H_PREDICTOR(W) \
+ static void smooth_h_##W##xh_neon( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+ const uint8_t *const left_column, const int height) { \
+ const uint8_t top_right = top_row[(W)-1]; \
+ \
+ const uint8x8_t top_right_v = vdup_n_u8(top_right); \
+ /* Over-reads for 4xN but still within the array. */ \
+ const uint8x8_t weights_x = vld1_u8(smooth_weights + (W)-4); \
+ const uint8x8_t scaled_weights_x = negate_s8(weights_x); \
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); \
+ \
+ assert(height > 0); \
+ int y = 0; \
+ do { \
+ const uint8x8_t left_v = vdup_n_u8(left_column[y]); \
+ const uint16x8_t weighted_left_tr = \
+ vmlal_u8(weighted_tr, weights_x, left_v); \
+ const uint8x8_t pred = \
+ vrshrn_n_u16(weighted_left_tr, SMOOTH_WEIGHT_LOG2_SCALE); \
+ \
+ if ((W) == 4) { \
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \
+ } else { /* width == 8 */ \
+ vst1_u8(dst, pred); \
+ } \
+ dst += stride; \
+ } while (++y != height); \
+ }
+
+SMOOTH_H_PREDICTOR(4)
+SMOOTH_H_PREDICTOR(8)
+
+#undef SMOOTH_H_PREDICTOR
+
+#define SMOOTH_H_NXM(W, H) \
+ void aom_smooth_h_predictor_##W##x##H##_neon( \
+ uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+ const uint8_t *left) { \
+ smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+SMOOTH_H_NXM(4, 4)
+SMOOTH_H_NXM(4, 8)
+SMOOTH_H_NXM(4, 16)
+SMOOTH_H_NXM(8, 4)
+SMOOTH_H_NXM(8, 8)
+SMOOTH_H_NXM(8, 16)
+SMOOTH_H_NXM(8, 32)
+
+#undef SMOOTH_H_NXM
+
+static INLINE uint8x16_t calculate_horizontal_weights_and_pred(
+ const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
+ const uint8x16_t scaled_weights_x) {
+ const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+ const uint16x8_t weighted_left_tr_low =
+ vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+ const uint8x8_t pred_scaled_low =
+ vrshrn_n_u16(weighted_left_tr_low, SMOOTH_WEIGHT_LOG2_SCALE);
+
+ const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
+ const uint16x8_t weighted_left_tr_high =
+ vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
+ const uint8x8_t pred_scaled_high =
+ vrshrn_n_u16(weighted_left_tr_high, SMOOTH_WEIGHT_LOG2_SCALE);
+
+ return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+// For width 16 and above.
+#define SMOOTH_H_PREDICTOR(W) \
+ static void smooth_h_##W##xh_neon( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+ const uint8_t *const left_column, const int height) { \
+ const uint8_t top_right = top_row[(W)-1]; \
+ \
+ const uint8x8_t top_right_v = vdup_n_u8(top_right); \
+ \
+ uint8x16_t weights_x[4]; \
+ weights_x[0] = vld1q_u8(smooth_weights + (W)-4); \
+ if ((W) > 16) { \
+ weights_x[1] = vld1q_u8(smooth_weights + (W) + 16 - 4); \
+ if ((W) == 64) { \
+ weights_x[2] = vld1q_u8(smooth_weights + (W) + 32 - 4); \
+ weights_x[3] = vld1q_u8(smooth_weights + (W) + 48 - 4); \
+ } \
+ } \
+ \
+ uint8x16_t scaled_weights_x[4]; \
+ scaled_weights_x[0] = negate_s8q(weights_x[0]); \
+ if ((W) > 16) { \
+ scaled_weights_x[1] = negate_s8q(weights_x[1]); \
+ if ((W) == 64) { \
+ scaled_weights_x[2] = negate_s8q(weights_x[2]); \
+ scaled_weights_x[3] = negate_s8q(weights_x[3]); \
+ } \
+ } \
+ \
+ assert(height > 0); \
+ int y = 0; \
+ do { \
+ const uint8x8_t left_v = vdup_n_u8(left_column[y]); \
+ \
+ const uint8x16_t pred_0 = calculate_horizontal_weights_and_pred( \
+ left_v, top_right_v, weights_x[0], scaled_weights_x[0]); \
+ vst1q_u8(dst, pred_0); \
+ \
+ if ((W) > 16) { \
+ const uint8x16_t pred_1 = calculate_horizontal_weights_and_pred( \
+ left_v, top_right_v, weights_x[1], scaled_weights_x[1]); \
+ vst1q_u8(dst + 16, pred_1); \
+ \
+ if ((W) == 64) { \
+ const uint8x16_t pred_2 = calculate_horizontal_weights_and_pred( \
+ left_v, top_right_v, weights_x[2], scaled_weights_x[2]); \
+ vst1q_u8(dst + 32, pred_2); \
+ \
+ const uint8x16_t pred_3 = calculate_horizontal_weights_and_pred( \
+ left_v, top_right_v, weights_x[3], scaled_weights_x[3]); \
+ vst1q_u8(dst + 48, pred_3); \
+ } \
+ } \
+ dst += stride; \
+ } while (++y != height); \
+ }
+
+SMOOTH_H_PREDICTOR(16)
+SMOOTH_H_PREDICTOR(32)
+SMOOTH_H_PREDICTOR(64)
+
+#undef SMOOTH_H_PREDICTOR
+
+#define SMOOTH_H_NXM_WIDE(W, H) \
+ void aom_smooth_h_predictor_##W##x##H##_neon( \
+ uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+ const uint8_t *left) { \
+ smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+SMOOTH_H_NXM_WIDE(16, 4)
+SMOOTH_H_NXM_WIDE(16, 8)
+SMOOTH_H_NXM_WIDE(16, 16)
+SMOOTH_H_NXM_WIDE(16, 32)
+SMOOTH_H_NXM_WIDE(16, 64)
+SMOOTH_H_NXM_WIDE(32, 8)
+SMOOTH_H_NXM_WIDE(32, 16)
+SMOOTH_H_NXM_WIDE(32, 32)
+SMOOTH_H_NXM_WIDE(32, 64)
+SMOOTH_H_NXM_WIDE(64, 16)
+SMOOTH_H_NXM_WIDE(64, 32)
+SMOOTH_H_NXM_WIDE(64, 64)
+
+#undef SMOOTH_H_NXM_WIDE
+
+// -----------------------------------------------------------------------------
+// PAETH
+
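+// Scalar form of the selection implemented below:
+//   base = left + top - top_left;
+//   predict whichever of left, top and top_left is closest to base,
+//   resolving ties in that order.
+// The distances are rewritten so that base itself never has to be formed:
+//   |base - left|     == |top - top_left|
+//   |base - top|      == |left - top_left|
+//   |base - top_left| == |top + left - 2 * top_left|
+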
+static INLINE void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride,
+ const uint8_t *const top_row,
+ const uint8_t *const left_column,
+ int width, int height) {
+ const uint8x8_t top_left = vdup_n_u8(top_row[-1]);
+ const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
+ uint8x8_t top;
+ if (width == 4) {
+ top = load_u8_4x1(top_row);
+ } else { // width == 8
+ top = vld1_u8(top_row);
+ }
+
+ assert(height > 0);
+ int y = 0;
+ do {
+ const uint8x8_t left = vdup_n_u8(left_column[y]);
+
+ const uint8x8_t left_dist = vabd_u8(top, top_left);
+ const uint8x8_t top_dist = vabd_u8(left, top_left);
+ const uint16x8_t top_left_dist =
+ vabdq_u16(vaddl_u8(top, left), top_left_x2);
+
+ const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
+ const uint8x8_t left_le_top_left =
+ vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
+ const uint8x8_t top_le_top_left =
+ vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));
+
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint8x8_t result = vbsl_u8(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ result = vbsl_u8(left_or_top_mask, result, top_left);
+
+ if (width == 4) {
+ store_u8_4x1(dest, result);
+ } else { // width == 8
+ vst1_u8(dest, result);
+ }
+ dest += stride;
+ } while (++y != height);
+}
+
+#define PAETH_NXM(W, H) \
+ void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
+ }
+
+PAETH_NXM(4, 4)
+PAETH_NXM(4, 8)
+PAETH_NXM(8, 4)
+PAETH_NXM(8, 8)
+PAETH_NXM(8, 16)
+
+PAETH_NXM(4, 16)
+PAETH_NXM(8, 32)
+
+// Calculate X distance <= TopLeft distance and pack the resulting mask into
+// a uint8x16_t.
+static INLINE uint8x16_t x_le_top_left(const uint8x16_t x_dist,
+ const uint16x8_t top_left_dist_low,
+ const uint16x8_t top_left_dist_high) {
+ const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
+ vqmovn_u16(top_left_dist_high));
+ return vcleq_u8(x_dist, top_left_dist);
+}
+
+// Select the closest values and collect them.
+static INLINE uint8x16_t select_paeth(const uint8x16_t top,
+ const uint8x16_t left,
+ const uint8x16_t top_left,
+ const uint8x16_t left_le_top,
+ const uint8x16_t left_le_top_left,
+ const uint8x16_t top_le_top_left) {
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint8x16_t result = vbslq_u8(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ return vbslq_u8(left_or_top_mask, result, top_left);
+}
+
+// Generate numbered and high/low versions of top_left_dist.
+#define TOP_LEFT_DIST(num) \
+ const uint16x8_t top_left_##num##_dist_low = vabdq_u16( \
+ vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \
+ const uint16x8_t top_left_##num##_dist_high = vabdq_u16( \
+ vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2)
+
+// Generate numbered versions of XLeTopLeft with x = left.
+#define LEFT_LE_TOP_LEFT(num) \
+ const uint8x16_t left_le_top_left_##num = \
+ x_le_top_left(left_##num##_dist, top_left_##num##_dist_low, \
+ top_left_##num##_dist_high)
+
+// Generate numbered versions of XLeTopLeft with x = top.
+#define TOP_LE_TOP_LEFT(num) \
+ const uint8x16_t top_le_top_left_##num = x_le_top_left( \
+ top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
+
+static INLINE void paeth16_plus_x_h_neon(uint8_t *dest, ptrdiff_t stride,
+ const uint8_t *const top_row,
+ const uint8_t *const left_column,
+ int width, int height) {
+ const uint8x16_t top_left = vdupq_n_u8(top_row[-1]);
+ const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
+ uint8x16_t top[4];
+ top[0] = vld1q_u8(top_row);
+ if (width > 16) {
+ top[1] = vld1q_u8(top_row + 16);
+ if (width == 64) {
+ top[2] = vld1q_u8(top_row + 32);
+ top[3] = vld1q_u8(top_row + 48);
+ }
+ }
+
+ assert(height > 0);
+ int y = 0;
+ do {
+ const uint8x16_t left = vdupq_n_u8(left_column[y]);
+
+ const uint8x16_t top_dist = vabdq_u8(left, top_left);
+
+ const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
+ TOP_LEFT_DIST(0);
+ const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
+ LEFT_LE_TOP_LEFT(0);
+ TOP_LE_TOP_LEFT(0);
+
+ const uint8x16_t result_0 =
+ select_paeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
+ top_le_top_left_0);
+ vst1q_u8(dest, result_0);
+
+ if (width > 16) {
+ const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
+ TOP_LEFT_DIST(1);
+ const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
+ LEFT_LE_TOP_LEFT(1);
+ TOP_LE_TOP_LEFT(1);
+
+ const uint8x16_t result_1 =
+ select_paeth(top[1], left, top_left, left_1_le_top,
+ left_le_top_left_1, top_le_top_left_1);
+ vst1q_u8(dest + 16, result_1);
+
+ if (width == 64) {
+ const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
+ TOP_LEFT_DIST(2);
+ const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
+ LEFT_LE_TOP_LEFT(2);
+ TOP_LE_TOP_LEFT(2);
+
+ const uint8x16_t result_2 =
+ select_paeth(top[2], left, top_left, left_2_le_top,
+ left_le_top_left_2, top_le_top_left_2);
+ vst1q_u8(dest + 32, result_2);
+
+ const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
+ TOP_LEFT_DIST(3);
+ const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
+ LEFT_LE_TOP_LEFT(3);
+ TOP_LE_TOP_LEFT(3);
+
+ const uint8x16_t result_3 =
+ select_paeth(top[3], left, top_left, left_3_le_top,
+ left_le_top_left_3, top_le_top_left_3);
+ vst1q_u8(dest + 48, result_3);
+ }
+ }
+
+ dest += stride;
+ } while (++y != height);
+}
+
+#define PAETH_NXM_WIDE(W, H) \
+ void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
+ }
+
+PAETH_NXM_WIDE(16, 8)
+PAETH_NXM_WIDE(16, 16)
+PAETH_NXM_WIDE(16, 32)
+PAETH_NXM_WIDE(32, 16)
+PAETH_NXM_WIDE(32, 32)
+PAETH_NXM_WIDE(32, 64)
+PAETH_NXM_WIDE(64, 32)
+PAETH_NXM_WIDE(64, 64)
+
+PAETH_NXM_WIDE(16, 4)
+PAETH_NXM_WIDE(16, 64)
+PAETH_NXM_WIDE(32, 8)
+PAETH_NXM_WIDE(64, 16)
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
new file mode 100644
index 0000000000..7c64be1253
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
@@ -0,0 +1,1045 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
+static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1,
+ uint8x8_t p0q0, const uint8_t blimit,
+ const uint8_t limit) {
+ // Calculate mask values for four samples
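+  // Each input vector holds the p samples in one half and the corresponding
+  // q samples in the other, so per output lane this evaluates:
+  //   |p3 - p2| <= limit && |p2 - p1| <= limit && |p1 - p0| <= limit &&
+  //   (the mirrored q-side checks) &&
+  //   |p0 - q0| * 2 + |p1 - q1| / 2 <= blimit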
+ uint32x2x2_t p0q0_p1q1;
+ uint16x8_t temp_16x8;
+ uint16x4_t temp0_16x4, temp1_16x4;
+ uint8x8_t mask_8x8, temp_8x8;
+ const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+ const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit);
+
+ mask_8x8 = vabd_u8(p3q3, p2q2);
+ mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p2q2, p1q1));
+ mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0));
+ mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+
+ temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+ temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+ vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+ temp_16x8 = vmovl_u8(temp_8x8);
+ temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
+ temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
+ temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+ temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+ temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
+
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ return mask_8x8;
+}
+
+static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0,
+ const uint8_t blimit, const uint8_t limit) {
+ uint32x2x2_t p0q0_p1q1;
+ uint16x8_t temp_16x8;
+ uint16x4_t temp0_16x4, temp1_16x4;
+ const uint16x4_t blimit_16x4 = vdup_n_u16(blimit);
+ const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+ uint8x8_t mask_8x8, temp_8x8;
+
+ mask_8x8 = vabd_u8(p1q1, p0q0);
+ mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+
+ temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+ temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+ vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+ temp_16x8 = vmovl_u8(temp_8x8);
+ temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
+ temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
+ temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+ temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+ temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
+
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ return mask_8x8;
+}
+
+static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2,
+ uint8x8_t p1q1, uint8x8_t p0q0) {
+ const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1
+ uint8x8_t flat_8x8, temp_8x8;
+
+ flat_8x8 = vabd_u8(p1q1, p0q0);
+ flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0));
+ flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p3q3, p0q0));
+ flat_8x8 = vcle_u8(flat_8x8, thresh_8x8);
+
+ temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8)));
+ flat_8x8 = vand_u8(flat_8x8, temp_8x8);
+
+ return flat_8x8;
+}
+
+static INLINE uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1,
+ uint8x8_t p0q0) {
+ const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1
+ uint8x8_t flat_8x8, temp_8x8;
+
+ flat_8x8 = vabd_u8(p1q1, p0q0);
+ flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0));
+ flat_8x8 = vcle_u8(flat_8x8, thresh_8x8);
+
+ temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8)));
+ flat_8x8 = vand_u8(flat_8x8, temp_8x8);
+
+ return flat_8x8;
+}
+
+static INLINE uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1,
+ uint8x8_t p0q0, const uint8_t blimit,
+ const uint8_t limit) {
+ // Calculate mask3 values for four samples
+ uint32x2x2_t p0q0_p1q1;
+ uint16x8_t temp_16x8;
+ uint16x4_t temp0_16x4, temp1_16x4;
+ uint8x8_t mask_8x8, temp_8x8;
+ const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+ const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit);
+
+ mask_8x8 = vabd_u8(p2q2, p1q1);
+ mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0));
+ mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+
+ temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+ temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+ vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+ temp_16x8 = vmovl_u8(temp_8x8);
+ temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
+ temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
+ temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+ temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+ temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
+
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ return mask_8x8;
+}
+
+static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4,
+ uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
+ uint8x8_t *p0q0, const uint8_t blimit,
+ const uint8_t limit, const uint8_t thresh) {
+ uint16x8_t out;
+ uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4,
+ out_f14_pq5;
+ uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
+ uint8x8_t out_f4_pq0, out_f4_pq1;
+ uint8x8_t mask_8x8, flat_8x8, flat2_8x8;
+ uint8x8_t q0p0, q1p1, q2p2;
+
+ // Calculate filter masks
+ mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
+ flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
+ flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0);
+ {
+ // filter 4
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+
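+    // XOR with 0x80 converts the unsigned pixels to signed values centered on
+    // zero so the filter arithmetic can use saturating signed operations.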
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+ hev_8x8 = vmvn_s8(hev_8x8);
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+ out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+ }
+ // reverse p and q
+ q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+ q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+ q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
+ {
+ // filter 8
+ uint16x8_t out_pq0, out_pq1, out_pq2;
+ out = vaddl_u8(*p3q3, *p2q2);
+ out = vaddw_u8(out, *p1q1);
+ out = vaddw_u8(out, *p0q0);
+
+ out = vaddw_u8(out, q0p0);
+ out_pq1 = vaddw_u8(out, *p3q3);
+ out_pq2 = vaddw_u8(out_pq1, *p3q3);
+ out_pq2 = vaddw_u8(out_pq2, *p2q2);
+ out_pq1 = vaddw_u8(out_pq1, *p1q1);
+ out_pq1 = vaddw_u8(out_pq1, q1p1);
+
+ out_pq0 = vaddw_u8(out, *p0q0);
+ out_pq0 = vaddw_u8(out_pq0, q1p1);
+ out_pq0 = vaddw_u8(out_pq0, q2p2);
+
+ out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
+ out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
+ out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
+ }
+ {
+ // filter 14
+ uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5;
+ uint16x8_t p6q6_2, p6q6_temp, qp_sum;
+ uint8x8_t qp_rev;
+
+ out = vaddw_u8(out, *p4q4);
+ out = vaddw_u8(out, *p5q5);
+ out = vaddw_u8(out, *p6q6);
+
+ out_pq5 = vaddw_u8(out, *p4q4);
+ out_pq4 = vaddw_u8(out_pq5, *p3q3);
+ out_pq3 = vaddw_u8(out_pq4, *p2q2);
+
+ out_pq5 = vaddw_u8(out_pq5, *p5q5);
+ out_pq4 = vaddw_u8(out_pq4, *p5q5);
+
+ out_pq0 = vaddw_u8(out, *p1q1);
+ out_pq1 = vaddw_u8(out_pq0, *p2q2);
+ out_pq2 = vaddw_u8(out_pq1, *p3q3);
+
+ out_pq0 = vaddw_u8(out_pq0, *p0q0);
+ out_pq1 = vaddw_u8(out_pq1, *p0q0);
+
+ out_pq1 = vaddw_u8(out_pq1, *p6q6);
+ p6q6_2 = vaddl_u8(*p6q6, *p6q6);
+ out_pq2 = vaddq_u16(out_pq2, p6q6_2);
+ p6q6_temp = vaddw_u8(p6q6_2, *p6q6);
+ out_pq3 = vaddq_u16(out_pq3, p6q6_temp);
+ p6q6_temp = vaddw_u8(p6q6_temp, *p6q6);
+ out_pq4 = vaddq_u16(out_pq4, p6q6_temp);
+ p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2);
+ out_pq5 = vaddq_u16(out_pq5, p6q6_temp);
+
+ out_pq4 = vaddw_u8(out_pq4, q1p1);
+
+ qp_sum = vaddl_u8(q2p2, q1p1);
+ out_pq3 = vaddq_u16(out_pq3, qp_sum);
+
+ qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3)));
+ qp_sum = vaddw_u8(qp_sum, qp_rev);
+ out_pq2 = vaddq_u16(out_pq2, qp_sum);
+
+ qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4)));
+ qp_sum = vaddw_u8(qp_sum, qp_rev);
+ out_pq1 = vaddq_u16(out_pq1, qp_sum);
+
+ qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5)));
+ qp_sum = vaddw_u8(qp_sum, qp_rev);
+ out_pq0 = vaddq_u16(out_pq0, qp_sum);
+
+ out_pq0 = vaddw_u8(out_pq0, q0p0);
+
+ out_f14_pq0 = vrshrn_n_u16(out_pq0, 4);
+ out_f14_pq1 = vrshrn_n_u16(out_pq1, 4);
+ out_f14_pq2 = vrshrn_n_u16(out_pq2, 4);
+ out_f14_pq3 = vrshrn_n_u16(out_pq3, 4);
+ out_f14_pq4 = vrshrn_n_u16(out_pq4, 4);
+ out_f14_pq5 = vrshrn_n_u16(out_pq5, 4);
+ }
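+  // Select the output for each pixel: filter4 where only the edge mask
+  // passes, filter8 where the flat mask also passes, and filter14 where the
+  // flat2 (wide flatness) mask passes as well.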
+ {
+ uint8x8_t filter4_cond, filter8_cond, filter14_cond;
+ filter8_cond = vand_u8(flat_8x8, mask_8x8);
+ filter4_cond = vmvn_u8(filter8_cond);
+ filter14_cond = vand_u8(filter8_cond, flat2_8x8);
+
+ // filter4 outputs
+ *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+ // filter8 outputs
+ *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
+ *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
+
+ // filter14 outputs
+ *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1);
+ *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2);
+ *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3);
+ *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4);
+ *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5);
+ }
+}
+
+static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
+ uint8x8_t *p0q0, const uint8_t blimit,
+ const uint8_t limit, const uint8_t thresh) {
+ uint16x8_t out;
+ uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
+ uint8x8_t out_f4_pq0, out_f4_pq1;
+ uint8x8_t mask_8x8, flat_8x8;
+
+ // Calculate filter masks
+ mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
+ flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
+ {
+ // filter 4
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+ hev_8x8 = vmvn_s8(hev_8x8);
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+ out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+ }
+ {
+ // filter 8
+ uint16x8_t out_pq0, out_pq1, out_pq2;
+ uint8x8_t q0p0, q1p1, q2p2;
+
+ out = vaddl_u8(*p3q3, *p2q2);
+ out = vaddw_u8(out, *p1q1);
+ out = vaddw_u8(out, *p0q0);
+
+ // reverse p and q
+ q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+ q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+ q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
+
+ out = vaddw_u8(out, q0p0);
+ out_pq1 = vaddw_u8(out, *p3q3);
+ out_pq2 = vaddw_u8(out_pq1, *p3q3);
+ out_pq2 = vaddw_u8(out_pq2, *p2q2);
+ out_pq1 = vaddw_u8(out_pq1, *p1q1);
+ out_pq1 = vaddw_u8(out_pq1, q1p1);
+
+ out_pq0 = vaddw_u8(out, *p0q0);
+ out_pq0 = vaddw_u8(out_pq0, q1p1);
+ out_pq0 = vaddw_u8(out_pq0, q2p2);
+
+ out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
+ out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
+ out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
+ }
+ {
+ uint8x8_t filter4_cond, filter8_cond;
+ filter8_cond = vand_u8(flat_8x8, mask_8x8);
+ filter4_cond = vmvn_u8(filter8_cond);
+
+ // filter4 outputs
+ *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+ // filter8 outputs
+ *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
+ *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
+ }
+}
+
+static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
+ const uint8_t blimit, const uint8_t limit,
+ const uint8_t thresh) {
+ uint16x8_t out;
+ uint8x8_t out_f6_pq0, out_f6_pq1;
+ uint8x8_t out_f4_pq0, out_f4_pq1;
+ uint8x8_t mask_8x8, flat_8x8;
+
+ // Calculate filter masks
+ mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit);
+ flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0);
+ {
+ // filter 4
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vbic_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+ out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+ }
+ {
+ // filter 6
+ uint16x8_t out_pq0, out_pq1;
+ uint8x8_t pq_rev;
+
+ out = vaddl_u8(*p0q0, *p1q1);
+ out = vaddq_u16(out, out);
+ out = vaddw_u8(out, *p2q2);
+
+ pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+ out = vaddw_u8(out, pq_rev);
+
+ out_pq0 = vaddw_u8(out, pq_rev);
+ pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+ out_pq0 = vaddw_u8(out_pq0, pq_rev);
+
+ out_pq1 = vaddw_u8(out, *p2q2);
+ out_pq1 = vaddw_u8(out_pq1, *p2q2);
+
+ out_f6_pq0 = vrshrn_n_u16(out_pq0, 3);
+ out_f6_pq1 = vrshrn_n_u16(out_pq1, 3);
+ }
+ {
+ uint8x8_t filter4_cond, filter6_cond;
+ filter6_cond = vand_u8(flat_8x8, mask_8x8);
+ filter4_cond = vmvn_u8(filter6_cond);
+
+ // filter4 outputs
+ *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+ // filter6 outputs
+ *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1);
+ }
+}
+
+static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit,
+ const uint8_t limit, const uint8_t thresh) {
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t mask_8x8, temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+
+ // Calculate filter mask
+ mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit);
+
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vbic_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+ *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+}
+
+void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x16_t row0, row1, row2, row3;
+ uint8x8_t pxp3, p6p2, p5p1, p4p0;
+ uint8x8_t q0q4, q1q5, q2q6, q3qy;
+ uint32x2x2_t p6q6_p2q2, p5q5_p1q1, p4q4_p0q0, pxqx_p3q3;
+ uint32x2_t pq_rev;
+ uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6;
+
+ // row0: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+ // row1: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+ // row2: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+ // row3: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+ load_u8_16x4(src - 8, stride, &row0, &row1, &row2, &row3);
+
+ pxp3 = vget_low_u8(row0);
+ p6p2 = vget_low_u8(row1);
+ p5p1 = vget_low_u8(row2);
+ p4p0 = vget_low_u8(row3);
+ transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+
+ q0q4 = vget_high_u8(row0);
+ q1q5 = vget_high_u8(row1);
+ q2q6 = vget_high_u8(row2);
+ q3qy = vget_high_u8(row3);
+ transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy));
+ pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(q1q5));
+ p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5p1), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(q0q4));
+ p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4p0), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(q2q6));
+ p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6p2), pq_rev);
+
+ p0q0 = vreinterpret_u8_u32(p4q4_p0q0.val[1]);
+ p1q1 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
+ p2q2 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
+ p3q3 = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
+ p4q4 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
+ p5q5 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
+ p6q6 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
+
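+  // Each pNqN vector now holds the four pN column values in its low half and
+  // the four qN column values in its high half, as expected by lpf_14_neon.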
+ lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
+ *thresh);
+
+ pxqx_p3q3 = vtrn_u32(pxqx_p3q3.val[0], vreinterpret_u32_u8(p3q3));
+ p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5q5), vreinterpret_u32_u8(p1q1));
+ p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4q4), vreinterpret_u32_u8(p0q0));
+ p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6q6), vreinterpret_u32_u8(p2q2));
+
+ pxqx_p3q3.val[1] = vrev64_u32(pxqx_p3q3.val[1]);
+ p5q5_p1q1.val[1] = vrev64_u32(p5q5_p1q1.val[1]);
+ p4q4_p0q0.val[1] = vrev64_u32(p4q4_p0q0.val[1]);
+ p6q6_p2q2.val[1] = vrev64_u32(p6q6_p2q2.val[1]);
+
+ q0q4 = vreinterpret_u8_u32(p4q4_p0q0.val[1]);
+ q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
+ q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
+ q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
+ transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+
+ pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]);
+ p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
+ p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
+ p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
+ transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+
+ row0 = vcombine_u8(pxp3, q0q4);
+ row1 = vcombine_u8(p6p2, q1q5);
+ row2 = vcombine_u8(p5p1, q2q6);
+ row3 = vcombine_u8(p4p0, q3qy);
+
+ store_u8_16x4(src - 8, stride, row0, row1, row2, row3);
+}
+
+void aom_lpf_vertical_14_dual_neon(
+ uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_14_neon(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_14_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_14_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ aom_lpf_vertical_14_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+ thresh);
+ aom_lpf_vertical_14_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit,
+ thresh, blimit, limit, thresh);
+}
+
+void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint32x2x2_t p2q2_p1q1, p3q3_p0q0;
+ uint32x2_t pq_rev;
+ uint8x8_t p3q0, p2q1, p1q2, p0q3;
+ uint8x8_t p0q0, p1q1, p2q2, p3q3;
+
+ // row0: p3 p2 p1 p0 | q0 q1 q2 q3
+ // row1: p3 p2 p1 p0 | q0 q1 q2 q3
+ // row2: p3 p2 p1 p0 | q0 q1 q2 q3
+ // row3: p3 p2 p1 p0 | q0 q1 q2 q3
+ load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3);
+
+ transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3));
+ p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
+
+ p0q0 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1]));
+ p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ p3q3 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
+
+ lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));
+ p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q3), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
+
+ p0q3 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1]));
+ p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
+ transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+
+ store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3);
+}
+
+void aom_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_8_neon(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_8_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_8_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ aom_lpf_vertical_8_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+ thresh);
+ aom_lpf_vertical_8_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit,
+ thresh, blimit, limit, thresh);
+}
+
+void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint32x2x2_t p2q2_p1q1, pxqy_p0q0;
+ uint32x2_t pq_rev;
+ uint8x8_t pxq0, p2q1, p1q2, p0qy;
+ uint8x8_t p0q0, p1q1, p2q2, pxqy;
+
+ // row0: px p2 p1 p0 | q0 q1 q2 qy
+ // row1: px p2 p1 p0 | q0 q1 q2 qy
+ // row2: px p2 p1 p0 | q0 q1 q2 qy
+ // row3: px p2 p1 p0 | q0 q1 q2 qy
+ load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy);
+
+ transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy));
+ pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
+
+ p0q0 = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
+ p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ pxqy = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
+
+ lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));
+ pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxqy), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
+
+ p0qy = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
+ p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
+ transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+
+ store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy);
+}
+
+void aom_lpf_vertical_6_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_6_neon(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_6_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_6_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ aom_lpf_vertical_6_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+ thresh);
+ aom_lpf_vertical_6_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit,
+ thresh, blimit, limit, thresh);
+}
+
+void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0;
+ uint32x2_t pq_rev;
+ uint8x8_t p1p0, q0q1;
+ uint8x8_t p0q0, p1q1;
+
+ // row0: p1 p0 | q0 q1
+ // row1: p1 p0 | q0 q1
+ // row2: p1 p0 | q0 q1
+ // row3: p1 p0 | q0 q1
+ load_unaligned_u8_4x4(src - 2, stride, &p1p0, &q0q1);
+
+ transpose_elems_inplace_u8_4x4(&p1p0, &q0q1);
+
+ p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1));
+
+ pq_rev = vrev64_u32(p1q0_p0q1.val[1]);
+ p1q1_p0q0 = vtrn_u32(p1q0_p0q1.val[0], pq_rev);
+
+ p1q1 = vreinterpret_u8_u32(p1q1_p0q0.val[0]);
+ p0q0 = vreinterpret_u8_u32(p1q1_p0q0.val[1]);
+
+ lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ p1p0_q1q0 = vtrn_u32(vreinterpret_u32_u8(p1q1), vreinterpret_u32_u8(p0q0));
+
+ p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]);
+ q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1]));
+
+ transpose_elems_inplace_u8_4x4(&p1p0, &q0q1);
+
+ store_u8x4_strided_x2(src - 2, 2 * stride, p1p0);
+ store_u8x4_strided_x2(src + stride - 2, 2 * stride, q0q1);
+}
+
+void aom_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_4_neon(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_4_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_4_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ aom_lpf_vertical_4_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+ thresh);
+ aom_lpf_vertical_4_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit,
+ thresh, blimit, limit, thresh);
+}
+
+void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t p6q6 = load_u8_4x2(src - 7 * stride, 13 * stride);
+ uint8x8_t p5q5 = load_u8_4x2(src - 6 * stride, 11 * stride);
+ uint8x8_t p4q4 = load_u8_4x2(src - 5 * stride, 9 * stride);
+ uint8x8_t p3q3 = load_u8_4x2(src - 4 * stride, 7 * stride);
+ uint8x8_t p2q2 = load_u8_4x2(src - 3 * stride, 5 * stride);
+ uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride);
+ uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride);
+
+ lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
+ *thresh);
+
+ store_u8x4_strided_x2(src - 1 * stride, 1 * stride, p0q0);
+ store_u8x4_strided_x2(src - 2 * stride, 3 * stride, p1q1);
+ store_u8x4_strided_x2(src - 3 * stride, 5 * stride, p2q2);
+ store_u8x4_strided_x2(src - 4 * stride, 7 * stride, p3q3);
+ store_u8x4_strided_x2(src - 5 * stride, 9 * stride, p4q4);
+ store_u8x4_strided_x2(src - 6 * stride, 11 * stride, p5q5);
+}
+
+void aom_lpf_horizontal_14_dual_neon(
+ uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_horizontal_14_neon(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_14_neon(s + 4, pitch, blimit1, limit1, thresh1);
+}
+
+// TODO(any): Rewrite in NEON (similar to the quad SSE2 functions) for a better
+// speedup.
+void aom_lpf_horizontal_14_quad_neon(uint8_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ aom_lpf_horizontal_14_dual_neon(s, pitch, blimit, limit, thresh, blimit,
+ limit, thresh);
+ aom_lpf_horizontal_14_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh,
+ blimit, limit, thresh);
+}
+
+void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t p0q0, p1q1, p2q2, p3q3;
+
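+  // Load four pixels of each pN row into the low half of the vector and the
+  // corresponding qN row into the high half.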
+ p3q3 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 4 * stride)));
+ p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride)));
+ p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride)));
+ p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride)));
+ p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride),
+ vreinterpret_u32_u8(p0q0), 1));
+ p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride),
+ vreinterpret_u32_u8(p1q1), 1));
+ p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride),
+ vreinterpret_u32_u8(p2q2), 1));
+ p3q3 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 3 * stride),
+ vreinterpret_u32_u8(p3q3), 1));
+
+ lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ vst1_lane_u32((uint32_t *)(src - 4 * stride), vreinterpret_u32_u8(p3q3), 0);
+ vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0);
+ vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0);
+ vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0);
+ vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1);
+ vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
+ vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
+ vst1_lane_u32((uint32_t *)(src + 3 * stride), vreinterpret_u32_u8(p3q3), 1);
+}
+
+void aom_lpf_horizontal_8_dual_neon(
+ uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_horizontal_8_neon(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_8_neon(s + 4, pitch, blimit1, limit1, thresh1);
+}
+
+// TODO(any): Rewrite in NEON (similar to the quad SSE2 functions) for a better
+// speedup.
+void aom_lpf_horizontal_8_quad_neon(uint8_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ aom_lpf_horizontal_8_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+ thresh);
+ aom_lpf_horizontal_8_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh,
+ blimit, limit, thresh);
+}
+
+void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t p0q0, p1q1, p2q2;
+
+ p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride)));
+ p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride)));
+ p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride)));
+ p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride),
+ vreinterpret_u32_u8(p0q0), 1));
+ p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride),
+ vreinterpret_u32_u8(p1q1), 1));
+ p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride),
+ vreinterpret_u32_u8(p2q2), 1));
+
+ lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0);
+ vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0);
+ vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0);
+ vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1);
+ vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
+ vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
+}
+
+void aom_lpf_horizontal_6_dual_neon(
+ uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_horizontal_6_neon(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_6_neon(s + 4, pitch, blimit1, limit1, thresh1);
+}
+
+// TODO(any): Rewrite in NEON (similar to the quad SSE2 functions) for a better
+// speedup.
+void aom_lpf_horizontal_6_quad_neon(uint8_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ aom_lpf_horizontal_6_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+ thresh);
+ aom_lpf_horizontal_6_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh,
+ blimit, limit, thresh);
+}
+
+void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride);
+ uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride);
+
+ lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ store_u8x4_strided_x2(src - 1 * stride, 1 * stride, p0q0);
+ store_u8x4_strided_x2(src - 2 * stride, 3 * stride, p1q1);
+}
+
+void aom_lpf_horizontal_4_dual_neon(
+ uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_horizontal_4_neon(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_4_neon(s + 4, pitch, blimit1, limit1, thresh1);
+}
+
+// TODO(any): Rewrite in NEON (similar to the quad SSE2 functions) for a better
+// speedup.
+void aom_lpf_horizontal_4_quad_neon(uint8_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ aom_lpf_horizontal_4_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+ thresh);
+ aom_lpf_horizontal_4_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh,
+ blimit, limit, thresh);
+}
diff --git a/third_party/aom/aom_dsp/arm/masked_sad4d_neon.c b/third_party/aom/aom_dsp/arm/masked_sad4d_neon.c
new file mode 100644
index 0000000000..8f65b805ec
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/masked_sad4d_neon.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "mem_neon.h"
+#include "sum_neon.h"
+
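+// Blend a0 and b0 according to mask m0 (values in
+// [0, AOM_BLEND_A64_MAX_ALPHA]) and accumulate the SAD of the blended result
+// against s0 into 16-bit lanes.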
+static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
+ const uint8x16_t s0,
+ const uint8x16_t a0,
+ const uint8x16_t b0,
+ const uint8x16_t m0) {
+ uint8x16_t m0_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
+ uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m0), vget_low_u8(a0));
+ uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m0), vget_high_u8(a0));
+ blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m0_inv), vget_low_u8(b0));
+ blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m0_inv), vget_high_u8(b0));
+
+ uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
+ uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
+ uint8x16_t blend_u8 = vcombine_u8(blend_u8_lo, blend_u8_hi);
+ return vpadalq_u8(sad, vabdq_u8(blend_u8, s0));
+}
+
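+// The "inv" variants apply the inverted mask by swapping the a0/b0 operands
+// passed to masked_sad_16x1_neon, so second_pred is weighted by m and the
+// reference by (AOM_BLEND_A64_MAX_ALPHA - m).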
+static INLINE void masked_inv_sadwxhx4d_large_neon(
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int width, int height, int h_overflow) {
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ int h_limit = height > h_overflow ? h_overflow : height;
+
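+  // Accumulate into 16-bit lanes for at most h_overflow rows at a time, then
+  // widen into the 32-bit accumulators so the 16-bit sums cannot overflow.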
+ int ref_offset = 0;
+ int i = 0;
+ do {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src + j);
+ uint8x16_t p0 = vld1q_u8(second_pred + j);
+ uint8x16_t m0 = vld1q_u8(mask + j);
+ sum_lo[0] = masked_sad_16x1_neon(sum_lo[0], s0, p0,
+ vld1q_u8(ref[0] + ref_offset + j), m0);
+ sum_lo[1] = masked_sad_16x1_neon(sum_lo[1], s0, p0,
+ vld1q_u8(ref[1] + ref_offset + j), m0);
+ sum_lo[2] = masked_sad_16x1_neon(sum_lo[2], s0, p0,
+ vld1q_u8(ref[2] + ref_offset + j), m0);
+ sum_lo[3] = masked_sad_16x1_neon(sum_lo[3], s0, p0,
+ vld1q_u8(ref[3] + ref_offset + j), m0);
+
+ uint8x16_t s1 = vld1q_u8(src + j + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + j + 16);
+ uint8x16_t m1 = vld1q_u8(mask + j + 16);
+ sum_hi[0] = masked_sad_16x1_neon(
+ sum_hi[0], s1, p1, vld1q_u8(ref[0] + ref_offset + j + 16), m1);
+ sum_hi[1] = masked_sad_16x1_neon(
+ sum_hi[1], s1, p1, vld1q_u8(ref[1] + ref_offset + j + 16), m1);
+ sum_hi[2] = masked_sad_16x1_neon(
+ sum_hi[2], s1, p1, vld1q_u8(ref[2] + ref_offset + j + 16), m1);
+ sum_hi[3] = masked_sad_16x1_neon(
+ sum_hi[3], s1, p1, vld1q_u8(ref[3] + ref_offset + j + 16), m1);
+
+ j += 32;
+ } while (j < width);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += width;
+ mask += mask_stride;
+ } while (++i < h_limit);
+
+ sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
+ sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
+ sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
+ sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
+ sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
+ sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
+ sum[3] = vpadalq_u16(sum[3], sum_lo[3]);
+ sum[3] = vpadalq_u16(sum[3], sum_hi[3]);
+
+ h_limit += h_overflow;
+ } while (i < height);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void masked_inv_sad128xhx4d_neon(
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ masked_inv_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred,
+ mask, mask_stride, res, 128, h, 32);
+}
+
+static INLINE void masked_inv_sad64xhx4d_neon(
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ masked_inv_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred,
+ mask, mask_stride, res, 64, h, 64);
+}
+
+static INLINE void masked_sadwxhx4d_large_neon(
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int width, int height, int h_overflow) {
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ int h_limit = height > h_overflow ? h_overflow : height;
+
+ int ref_offset = 0;
+ int i = 0;
+ do {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src + j);
+ uint8x16_t p0 = vld1q_u8(second_pred + j);
+ uint8x16_t m0 = vld1q_u8(mask + j);
+ sum_lo[0] = masked_sad_16x1_neon(
+ sum_lo[0], s0, vld1q_u8(ref[0] + ref_offset + j), p0, m0);
+ sum_lo[1] = masked_sad_16x1_neon(
+ sum_lo[1], s0, vld1q_u8(ref[1] + ref_offset + j), p0, m0);
+ sum_lo[2] = masked_sad_16x1_neon(
+ sum_lo[2], s0, vld1q_u8(ref[2] + ref_offset + j), p0, m0);
+ sum_lo[3] = masked_sad_16x1_neon(
+ sum_lo[3], s0, vld1q_u8(ref[3] + ref_offset + j), p0, m0);
+
+ uint8x16_t s1 = vld1q_u8(src + j + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + j + 16);
+ uint8x16_t m1 = vld1q_u8(mask + j + 16);
+ sum_hi[0] = masked_sad_16x1_neon(
+ sum_hi[0], s1, vld1q_u8(ref[0] + ref_offset + j + 16), p1, m1);
+ sum_hi[1] = masked_sad_16x1_neon(
+ sum_hi[1], s1, vld1q_u8(ref[1] + ref_offset + j + 16), p1, m1);
+ sum_hi[2] = masked_sad_16x1_neon(
+ sum_hi[2], s1, vld1q_u8(ref[2] + ref_offset + j + 16), p1, m1);
+ sum_hi[3] = masked_sad_16x1_neon(
+ sum_hi[3], s1, vld1q_u8(ref[3] + ref_offset + j + 16), p1, m1);
+
+ j += 32;
+ } while (j < width);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += width;
+ mask += mask_stride;
+ } while (++i < h_limit);
+
+ sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
+ sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
+ sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
+ sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
+ sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
+ sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
+ sum[3] = vpadalq_u16(sum[3], sum_lo[3]);
+ sum[3] = vpadalq_u16(sum[3], sum_hi[3]);
+
+ h_limit += h_overflow;
+ } while (i < height);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void masked_sad128xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ masked_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred,
+ mask, mask_stride, res, 128, h, 32);
+}
+
+static INLINE void masked_sad64xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ masked_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred,
+ mask, mask_stride, res, 64, h, 64);
+}
+
+static INLINE void masked_inv_sad32xhx4d_neon(
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t m0 = vld1q_u8(mask);
+ sum_lo[0] = masked_sad_16x1_neon(sum_lo[0], s0, p0,
+ vld1q_u8(ref[0] + ref_offset), m0);
+ sum_lo[1] = masked_sad_16x1_neon(sum_lo[1], s0, p0,
+ vld1q_u8(ref[1] + ref_offset), m0);
+ sum_lo[2] = masked_sad_16x1_neon(sum_lo[2], s0, p0,
+ vld1q_u8(ref[2] + ref_offset), m0);
+ sum_lo[3] = masked_sad_16x1_neon(sum_lo[3], s0, p0,
+ vld1q_u8(ref[3] + ref_offset), m0);
+
+ uint8x16_t s1 = vld1q_u8(src + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t m1 = vld1q_u8(mask + 16);
+ sum_hi[0] = masked_sad_16x1_neon(sum_hi[0], s1, p1,
+ vld1q_u8(ref[0] + ref_offset + 16), m1);
+ sum_hi[1] = masked_sad_16x1_neon(sum_hi[1], s1, p1,
+ vld1q_u8(ref[1] + ref_offset + 16), m1);
+ sum_hi[2] = masked_sad_16x1_neon(sum_hi[2], s1, p1,
+ vld1q_u8(ref[2] + ref_offset + 16), m1);
+ sum_hi[3] = masked_sad_16x1_neon(sum_hi[3], s1, p1,
+ vld1q_u8(ref[3] + ref_offset + 16), m1);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 32;
+ mask += mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi));
+}
+
+static INLINE void masked_sad32xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t m0 = vld1q_u8(mask);
+ sum_lo[0] = masked_sad_16x1_neon(sum_lo[0], s0,
+ vld1q_u8(ref[0] + ref_offset), p0, m0);
+ sum_lo[1] = masked_sad_16x1_neon(sum_lo[1], s0,
+ vld1q_u8(ref[1] + ref_offset), p0, m0);
+ sum_lo[2] = masked_sad_16x1_neon(sum_lo[2], s0,
+ vld1q_u8(ref[2] + ref_offset), p0, m0);
+ sum_lo[3] = masked_sad_16x1_neon(sum_lo[3], s0,
+ vld1q_u8(ref[3] + ref_offset), p0, m0);
+
+ uint8x16_t s1 = vld1q_u8(src + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t m1 = vld1q_u8(mask + 16);
+ sum_hi[0] = masked_sad_16x1_neon(
+ sum_hi[0], s1, vld1q_u8(ref[0] + ref_offset + 16), p1, m1);
+ sum_hi[1] = masked_sad_16x1_neon(
+ sum_hi[1], s1, vld1q_u8(ref[1] + ref_offset + 16), p1, m1);
+ sum_hi[2] = masked_sad_16x1_neon(
+ sum_hi[2], s1, vld1q_u8(ref[2] + ref_offset + 16), p1, m1);
+ sum_hi[3] = masked_sad_16x1_neon(
+ sum_hi[3], s1, vld1q_u8(ref[3] + ref_offset + 16), p1, m1);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 32;
+ mask += mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi));
+}
+
+static INLINE void masked_inv_sad16xhx4d_neon(
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32[4];
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t m0 = vld1q_u8(mask);
+ sum_u16[0] = masked_sad_16x1_neon(sum_u16[0], s0, p0,
+ vld1q_u8(ref[0] + ref_offset), m0);
+ sum_u16[1] = masked_sad_16x1_neon(sum_u16[1], s0, p0,
+ vld1q_u8(ref[1] + ref_offset), m0);
+ sum_u16[2] = masked_sad_16x1_neon(sum_u16[2], s0, p0,
+ vld1q_u8(ref[2] + ref_offset), m0);
+ sum_u16[3] = masked_sad_16x1_neon(sum_u16[3], s0, p0,
+ vld1q_u8(ref[3] + ref_offset), m0);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 16;
+ mask += mask_stride;
+ } while (--i != 0);
+
+ sum_u32[0] = vpaddlq_u16(sum_u16[0]);
+ sum_u32[1] = vpaddlq_u16(sum_u16[1]);
+ sum_u32[2] = vpaddlq_u16(sum_u16[2]);
+ sum_u32[3] = vpaddlq_u16(sum_u16[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
+}
+
+static INLINE void masked_sad16xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32[4];
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t m0 = vld1q_u8(mask);
+ sum_u16[0] = masked_sad_16x1_neon(sum_u16[0], s0,
+ vld1q_u8(ref[0] + ref_offset), p0, m0);
+ sum_u16[1] = masked_sad_16x1_neon(sum_u16[1], s0,
+ vld1q_u8(ref[1] + ref_offset), p0, m0);
+ sum_u16[2] = masked_sad_16x1_neon(sum_u16[2], s0,
+ vld1q_u8(ref[2] + ref_offset), p0, m0);
+ sum_u16[3] = masked_sad_16x1_neon(sum_u16[3], s0,
+ vld1q_u8(ref[3] + ref_offset), p0, m0);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 16;
+ mask += mask_stride;
+ } while (--i != 0);
+
+ sum_u32[0] = vpaddlq_u16(sum_u16[0]);
+ sum_u32[1] = vpaddlq_u16(sum_u16[1]);
+ sum_u32[2] = vpaddlq_u16(sum_u16[2]);
+ sum_u32[3] = vpaddlq_u16(sum_u16[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
+}
+
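+// 8-wide variant of masked_sad_16x1_neon: blend a0 and b0 with mask m0 and
+// accumulate the SAD of the result against s0.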
+static INLINE uint16x8_t masked_sad_8x1_neon(uint16x8_t sad, const uint8x8_t s0,
+ const uint8x8_t a0,
+ const uint8x8_t b0,
+ const uint8x8_t m0) {
+ uint8x8_t m0_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
+ uint16x8_t blend_u16 = vmull_u8(m0, a0);
+ blend_u16 = vmlal_u8(blend_u16, m0_inv, b0);
+
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+ return vabal_u8(sad, blend_u8, s0);
+}
+
+static INLINE void masked_inv_sad8xhx4d_neon(
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t p0 = vld1_u8(second_pred);
+ uint8x8_t m0 = vld1_u8(mask);
+ sum[0] =
+ masked_sad_8x1_neon(sum[0], s0, p0, vld1_u8(ref[0] + ref_offset), m0);
+ sum[1] =
+ masked_sad_8x1_neon(sum[1], s0, p0, vld1_u8(ref[1] + ref_offset), m0);
+ sum[2] =
+ masked_sad_8x1_neon(sum[2], s0, p0, vld1_u8(ref[2] + ref_offset), m0);
+ sum[3] =
+ masked_sad_8x1_neon(sum[3], s0, p0, vld1_u8(ref[3] + ref_offset), m0);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 8;
+ mask += mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u16x8(sum));
+}
+
+static INLINE void masked_sad8xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t p0 = vld1_u8(second_pred);
+ uint8x8_t m0 = vld1_u8(mask);
+
+ sum[0] =
+ masked_sad_8x1_neon(sum[0], s0, vld1_u8(ref[0] + ref_offset), p0, m0);
+ sum[1] =
+ masked_sad_8x1_neon(sum[1], s0, vld1_u8(ref[1] + ref_offset), p0, m0);
+ sum[2] =
+ masked_sad_8x1_neon(sum[2], s0, vld1_u8(ref[2] + ref_offset), p0, m0);
+ sum[3] =
+ masked_sad_8x1_neon(sum[3], s0, vld1_u8(ref[3] + ref_offset), p0, m0);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 8;
+ mask += mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u16x8(sum));
+}
+
+static INLINE void masked_inv_sad4xhx4d_neon(
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h / 2;
+ do {
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
+ uint8x8_t r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
+ uint8x8_t r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
+ uint8x8_t r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride);
+ uint8x8_t p0 = vld1_u8(second_pred);
+ uint8x8_t m0 = load_unaligned_u8(mask, mask_stride);
+
+ sum[0] = masked_sad_8x1_neon(sum[0], s, p0, r0, m0);
+ sum[1] = masked_sad_8x1_neon(sum[1], s, p0, r1, m0);
+ sum[2] = masked_sad_8x1_neon(sum[2], s, p0, r2, m0);
+ sum[3] = masked_sad_8x1_neon(sum[3], s, p0, r3, m0);
+
+ src += 2 * src_stride;
+ ref_offset += 2 * ref_stride;
+ second_pred += 2 * 4;
+ mask += 2 * mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u16x8(sum));
+}
+
+static INLINE void masked_sad4xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h / 2;
+ do {
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
+ uint8x8_t r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
+ uint8x8_t r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
+ uint8x8_t r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride);
+ uint8x8_t p0 = vld1_u8(second_pred);
+ uint8x8_t m0 = load_unaligned_u8(mask, mask_stride);
+
+ sum[0] = masked_sad_8x1_neon(sum[0], s, r0, p0, m0);
+ sum[1] = masked_sad_8x1_neon(sum[1], s, r1, p0, m0);
+ sum[2] = masked_sad_8x1_neon(sum[2], s, r2, p0, m0);
+ sum[3] = masked_sad_8x1_neon(sum[3], s, r3, p0, m0);
+
+ src += 2 * src_stride;
+ ref_offset += 2 * ref_stride;
+ second_pred += 2 * 4;
+ mask += 2 * mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u16x8(sum));
+}
+
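+// Define the aom_masked_sad<w>x<h>x4d_neon entry points. When invert_mask is
+// set, the mask weights second_pred rather than the reference blocks.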
+#define MASKED_SAD4D_WXH_NEON(w, h) \
+ void aom_masked_sad##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int invert_mask, uint32_t res[4]) { \
+ if (invert_mask) { \
+ masked_inv_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, \
+ second_pred, msk, msk_stride, res, h); \
+ } else { \
+ masked_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, second_pred, \
+ msk, msk_stride, res, h); \
+ } \
+ }
+
+MASKED_SAD4D_WXH_NEON(4, 8)
+MASKED_SAD4D_WXH_NEON(4, 4)
+
+MASKED_SAD4D_WXH_NEON(8, 16)
+MASKED_SAD4D_WXH_NEON(8, 8)
+MASKED_SAD4D_WXH_NEON(8, 4)
+
+MASKED_SAD4D_WXH_NEON(16, 32)
+MASKED_SAD4D_WXH_NEON(16, 16)
+MASKED_SAD4D_WXH_NEON(16, 8)
+
+MASKED_SAD4D_WXH_NEON(32, 64)
+MASKED_SAD4D_WXH_NEON(32, 32)
+MASKED_SAD4D_WXH_NEON(32, 16)
+
+MASKED_SAD4D_WXH_NEON(64, 128)
+MASKED_SAD4D_WXH_NEON(64, 64)
+MASKED_SAD4D_WXH_NEON(64, 32)
+
+MASKED_SAD4D_WXH_NEON(128, 128)
+MASKED_SAD4D_WXH_NEON(128, 64)
+
+#if !CONFIG_REALTIME_ONLY
+MASKED_SAD4D_WXH_NEON(4, 16)
+MASKED_SAD4D_WXH_NEON(16, 4)
+MASKED_SAD4D_WXH_NEON(8, 32)
+MASKED_SAD4D_WXH_NEON(32, 8)
+MASKED_SAD4D_WXH_NEON(16, 64)
+MASKED_SAD4D_WXH_NEON(64, 16)
+#endif
diff --git a/third_party/aom/aom_dsp/arm/masked_sad_neon.c b/third_party/aom/aom_dsp/arm/masked_sad_neon.c
new file mode 100644
index 0000000000..9d263105e3
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/masked_sad_neon.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/blend.h"
+
+static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
+ const uint8_t *src,
+ const uint8_t *a,
+ const uint8_t *b,
+ const uint8_t *m) {
+ uint8x16_t m0 = vld1q_u8(m);
+ uint8x16_t a0 = vld1q_u8(a);
+ uint8x16_t b0 = vld1q_u8(b);
+ uint8x16_t s0 = vld1q_u8(src);
+
+ uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, a0, b0);
+
+ return vpadalq_u8(sad, vabdq_u8(blend_u8, s0));
+}
+
+static INLINE unsigned masked_sad_128xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // Eight accumulator vectors are required to avoid overflow in the 128x128
+ // case.
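+  // Each uint16 lane accumulates at most 2 * 255 per row via vpadalq_u8, so
+  // 128 rows stay below the 16-bit limit.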
+ assert(height <= 128);
+ uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ do {
+ sad[0] = masked_sad_16x1_neon(sad[0], &src[0], &a[0], &b[0], &m[0]);
+ sad[1] = masked_sad_16x1_neon(sad[1], &src[16], &a[16], &b[16], &m[16]);
+ sad[2] = masked_sad_16x1_neon(sad[2], &src[32], &a[32], &b[32], &m[32]);
+ sad[3] = masked_sad_16x1_neon(sad[3], &src[48], &a[48], &b[48], &m[48]);
+ sad[4] = masked_sad_16x1_neon(sad[4], &src[64], &a[64], &b[64], &m[64]);
+ sad[5] = masked_sad_16x1_neon(sad[5], &src[80], &a[80], &b[80], &m[80]);
+ sad[6] = masked_sad_16x1_neon(sad[6], &src[96], &a[96], &b[96], &m[96]);
+ sad[7] = masked_sad_16x1_neon(sad[7], &src[112], &a[112], &b[112], &m[112]);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ height--;
+ } while (height != 0);
+
+ return horizontal_long_add_u16x8(sad[0], sad[1]) +
+ horizontal_long_add_u16x8(sad[2], sad[3]) +
+ horizontal_long_add_u16x8(sad[4], sad[5]) +
+ horizontal_long_add_u16x8(sad[6], sad[7]);
+}
+
+static INLINE unsigned masked_sad_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // Four accumulator vectors are required to avoid overflow in the 64x128 case.
+ assert(height <= 128);
+ uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ do {
+ sad[0] = masked_sad_16x1_neon(sad[0], &src[0], &a[0], &b[0], &m[0]);
+ sad[1] = masked_sad_16x1_neon(sad[1], &src[16], &a[16], &b[16], &m[16]);
+ sad[2] = masked_sad_16x1_neon(sad[2], &src[32], &a[32], &b[32], &m[32]);
+ sad[3] = masked_sad_16x1_neon(sad[3], &src[48], &a[48], &b[48], &m[48]);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ height--;
+ } while (height != 0);
+
+ return horizontal_long_add_u16x8(sad[0], sad[1]) +
+ horizontal_long_add_u16x8(sad[2], sad[3]);
+}
+
+static INLINE unsigned masked_sad_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // We could use a single accumulator up to height=64 without overflow.
+ assert(height <= 64);
+ uint16x8_t sad = vdupq_n_u16(0);
+
+ do {
+ sad = masked_sad_16x1_neon(sad, &src[0], &a[0], &b[0], &m[0]);
+ sad = masked_sad_16x1_neon(sad, &src[16], &a[16], &b[16], &m[16]);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ height--;
+ } while (height != 0);
+
+ return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned masked_sad_16xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // We could use a single accumulator up to height=128 without overflow.
+ assert(height <= 128);
+ uint16x8_t sad = vdupq_n_u16(0);
+
+ do {
+ sad = masked_sad_16x1_neon(sad, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ height--;
+ } while (height != 0);
+
+ return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned masked_sad_8xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // We could use a single accumulator up to height=128 without overflow.
+ assert(height <= 128);
+ uint16x4_t sad = vdup_n_u16(0);
+
+ do {
+ uint8x8_t m0 = vld1_u8(m);
+ uint8x8_t a0 = vld1_u8(a);
+ uint8x8_t b0 = vld1_u8(b);
+ uint8x8_t s0 = vld1_u8(src);
+
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0);
+
+ sad = vpadal_u8(sad, vabd_u8(blend_u8, s0));
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ height--;
+ } while (height != 0);
+
+ return horizontal_add_u16x4(sad);
+}
+
+static INLINE unsigned masked_sad_4xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // Process two rows per loop iteration.
+ assert(height % 2 == 0);
+
+ // We could use a single accumulator up to height=256 without overflow.
+ assert(height <= 256);
+ uint16x4_t sad = vdup_n_u16(0);
+
+ do {
+ uint8x8_t m0 = load_unaligned_u8(m, m_stride);
+ uint8x8_t a0 = load_unaligned_u8(a, a_stride);
+ uint8x8_t b0 = load_unaligned_u8(b, b_stride);
+ uint8x8_t s0 = load_unaligned_u8(src, src_stride);
+
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0);
+
+ sad = vpadal_u8(sad, vabd_u8(blend_u8, s0));
+
+ src += 2 * src_stride;
+ a += 2 * a_stride;
+ b += 2 * b_stride;
+ m += 2 * m_stride;
+ height -= 2;
+ } while (height != 0);
+
+ return horizontal_add_u16x4(sad);
+}
+
+#define MASKED_SAD_WXH_NEON(width, height) \
+ unsigned aom_masked_sad##width##x##height##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return masked_sad_##width##xh_neon(src, src_stride, ref, ref_stride, \
+ second_pred, width, msk, msk_stride, \
+ height); \
+ else \
+ return masked_sad_##width##xh_neon(src, src_stride, second_pred, width, \
+ ref, ref_stride, msk, msk_stride, \
+ height); \
+ }
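+
+// Note that `width` is passed as the row stride for `second_pred`: the second
+// predictor is a contiguous width x height block. Expanding, for example,
+// MASKED_SAD_WXH_NEON(16, 8) defines aom_masked_sad16x8_neon(), which forwards
+// to masked_sad_16xh_neon() with either (ref, second_pred) or
+// (second_pred, ref) as the two blend inputs, depending on invert_mask.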
+
+MASKED_SAD_WXH_NEON(4, 4)
+MASKED_SAD_WXH_NEON(4, 8)
+MASKED_SAD_WXH_NEON(8, 4)
+MASKED_SAD_WXH_NEON(8, 8)
+MASKED_SAD_WXH_NEON(8, 16)
+MASKED_SAD_WXH_NEON(16, 8)
+MASKED_SAD_WXH_NEON(16, 16)
+MASKED_SAD_WXH_NEON(16, 32)
+MASKED_SAD_WXH_NEON(32, 16)
+MASKED_SAD_WXH_NEON(32, 32)
+MASKED_SAD_WXH_NEON(32, 64)
+MASKED_SAD_WXH_NEON(64, 32)
+MASKED_SAD_WXH_NEON(64, 64)
+MASKED_SAD_WXH_NEON(64, 128)
+MASKED_SAD_WXH_NEON(128, 64)
+MASKED_SAD_WXH_NEON(128, 128)
+#if !CONFIG_REALTIME_ONLY
+MASKED_SAD_WXH_NEON(4, 16)
+MASKED_SAD_WXH_NEON(16, 4)
+MASKED_SAD_WXH_NEON(8, 32)
+MASKED_SAD_WXH_NEON(32, 8)
+MASKED_SAD_WXH_NEON(16, 64)
+MASKED_SAD_WXH_NEON(64, 16)
+#endif
diff --git a/third_party/aom/aom_dsp/arm/mem_neon.h b/third_party/aom/aom_dsp/arm/mem_neon.h
new file mode 100644
index 0000000000..52c7a34e3e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/mem_neon.h
@@ -0,0 +1,1253 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
+#define AOM_AOM_DSP_ARM_MEM_NEON_H_
+
+#include <arm_neon.h>
+#include <string.h>
+#include "aom_dsp/aom_dsp_common.h"
+
+// Support for xN Neon intrinsics is lacking in some compilers.
+#if defined(__arm__) || defined(_M_ARM)
+#define ARM_32_BIT
+#endif
+
+// DEFICIENT_CLANG_32_BIT includes clang-cl.
+#if defined(__clang__) && defined(ARM_32_BIT) && \
+ (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
+#define DEFICIENT_CLANG_32_BIT // This includes clang-cl.
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT)
+#define GCC_32_BIT
+#endif
+
+#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)
+
+static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
+ uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+ vld1q_u8(ptr + 2 * 16) } };
+ return res;
+}
+
+static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
+ uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
+ return res;
+}
+
+static INLINE uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
+ uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } };
+ return res;
+}
+
+static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
+ uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
+ vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
+ return res;
+}
+
+#elif defined(__GNUC__) && !defined(__clang__) // GCC 64-bit.
+#if __GNUC__ < 8
+
+static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
+ uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
+ return res;
+}
+
+static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
+ uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
+ vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
+ return res;
+}
+#endif // __GNUC__ < 8
+
+#if __GNUC__ < 9
+static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
+ uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+ vld1q_u8(ptr + 2 * 16) } };
+ return res;
+}
+#endif // __GNUC__ < 9
+#endif // defined(__GNUC__) && !defined(__clang__)
+
+static INLINE void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+ const uint8x8_t s1) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+}
+
+static INLINE uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) {
+ return vcombine_u8(vld1_u8(s), vld1_u8(s + p));
+}
+
+// Load four bytes into the low half of a uint8x8_t, zero the upper half.
+static INLINE uint8x8_t load_u8_4x1(const uint8_t *p) {
+ uint8x8_t ret = vdup_n_u8(0);
+ ret = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
+ return ret;
+}
+
+static INLINE uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
+ uint8x8_t ret = vdup_n_u8(0);
+ ret = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
+ p += stride;
+ ret = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1));
+ return ret;
+}
+
+static INLINE uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
+ uint16x4_t ret = vdup_n_u16(0);
+ ret = vreinterpret_u16_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0));
+ p += stride;
+ ret = vreinterpret_u16_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1));
+ return ret;
+}
+
+static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4, uint8x8_t *const s5,
+ uint8x8_t *const s6, uint8x8_t *const s7) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+ s += p;
+ *s5 = vld1_u8(s);
+ s += p;
+ *s6 = vld1_u8(s);
+ s += p;
+ *s7 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x7(const uint8_t *s, ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4, uint8x8_t *const s5,
+ uint8x8_t *const s6) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+ s += p;
+ *s5 = vld1_u8(s);
+ s += p;
+ *s6 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+}
+
+static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
+ uint16x4_t *const s0, uint16x4_t *const s1,
+ uint16x4_t *const s2, uint16x4_t *const s3) {
+ *s0 = vld1_u16(s);
+ s += p;
+ *s1 = vld1_u16(s);
+ s += p;
+ *s2 = vld1_u16(s);
+ s += p;
+ *s3 = vld1_u16(s);
+ s += p;
+}
+
+static INLINE void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
+ uint16x4_t *const s0, uint16x4_t *const s1,
+ uint16x4_t *const s2, uint16x4_t *const s3,
+ uint16x4_t *const s4, uint16x4_t *const s5,
+ uint16x4_t *const s6) {
+ *s0 = vld1_u16(s);
+ s += p;
+ *s1 = vld1_u16(s);
+ s += p;
+ *s2 = vld1_u16(s);
+ s += p;
+ *s3 = vld1_u16(s);
+ s += p;
+ *s4 = vld1_u16(s);
+ s += p;
+ *s5 = vld1_u16(s);
+ s += p;
+ *s6 = vld1_u16(s);
+}
+
+static INLINE void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+}
+
+static INLINE void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *const s0, uint16x8_t *const s1) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+}
+
+static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *const s0, uint16x8_t *const s1,
+ uint16x8_t *const s2, uint16x8_t *const s3) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+ s += p;
+}
+
+static INLINE void load_s16_4x12(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4, int16x4_t *const s5,
+ int16x4_t *const s6, int16x4_t *const s7,
+ int16x4_t *const s8, int16x4_t *const s9,
+ int16x4_t *const s10, int16x4_t *const s11) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+ s += p;
+ *s7 = vld1_s16(s);
+ s += p;
+ *s8 = vld1_s16(s);
+ s += p;
+ *s9 = vld1_s16(s);
+ s += p;
+ *s10 = vld1_s16(s);
+ s += p;
+ *s11 = vld1_s16(s);
+}
+
+static INLINE void load_s16_4x11(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4, int16x4_t *const s5,
+ int16x4_t *const s6, int16x4_t *const s7,
+ int16x4_t *const s8, int16x4_t *const s9,
+ int16x4_t *const s10) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+ s += p;
+ *s7 = vld1_s16(s);
+ s += p;
+ *s8 = vld1_s16(s);
+ s += p;
+ *s9 = vld1_s16(s);
+ s += p;
+ *s10 = vld1_s16(s);
+}
+
+static INLINE void load_u16_4x11(const uint16_t *s, ptrdiff_t p,
+ uint16x4_t *const s0, uint16x4_t *const s1,
+ uint16x4_t *const s2, uint16x4_t *const s3,
+ uint16x4_t *const s4, uint16x4_t *const s5,
+ uint16x4_t *const s6, uint16x4_t *const s7,
+ uint16x4_t *const s8, uint16x4_t *const s9,
+ uint16x4_t *const s10) {
+ *s0 = vld1_u16(s);
+ s += p;
+ *s1 = vld1_u16(s);
+ s += p;
+ *s2 = vld1_u16(s);
+ s += p;
+ *s3 = vld1_u16(s);
+ s += p;
+ *s4 = vld1_u16(s);
+ s += p;
+ *s5 = vld1_u16(s);
+ s += p;
+ *s6 = vld1_u16(s);
+ s += p;
+ *s7 = vld1_u16(s);
+ s += p;
+ *s8 = vld1_u16(s);
+ s += p;
+ *s9 = vld1_u16(s);
+ s += p;
+ *s10 = vld1_u16(s);
+}
+
+static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4, int16x4_t *const s5,
+ int16x4_t *const s6, int16x4_t *const s7) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+ s += p;
+ *s7 = vld1_s16(s);
+}
+
+static INLINE void load_s16_4x7(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4, int16x4_t *const s5,
+ int16x4_t *const s6) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+}
+
+static INLINE void load_s16_4x6(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4, int16x4_t *const s5) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+}
+
+static INLINE void load_s16_4x5(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+}
+
+static INLINE void load_u16_4x5(const uint16_t *s, const ptrdiff_t p,
+ uint16x4_t *const s0, uint16x4_t *const s1,
+ uint16x4_t *const s2, uint16x4_t *const s3,
+ uint16x4_t *const s4) {
+ *s0 = vld1_u16(s);
+ s += p;
+ *s1 = vld1_u16(s);
+ s += p;
+ *s2 = vld1_u16(s);
+ s += p;
+ *s3 = vld1_u16(s);
+ s += p;
+ *s4 = vld1_u16(s);
+ s += p;
+}
+
+static INLINE void load_u8_8x5(const uint8_t *s, ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+}
+
+static INLINE void load_u16_8x5(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *const s0, uint16x8_t *const s1,
+ uint16x8_t *const s2, uint16x8_t *const s3,
+ uint16x8_t *const s4) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+ s += p;
+ *s4 = vld1q_u16(s);
+ s += p;
+}
+
+static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+}
+
+static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+ const uint8x8_t s1, const uint8x8_t s2,
+ const uint8x8_t s3, const uint8x8_t s4,
+ const uint8x8_t s5, const uint8x8_t s6,
+ const uint8x8_t s7) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+ vst1_u8(s, s2);
+ s += p;
+ vst1_u8(s, s3);
+ s += p;
+ vst1_u8(s, s4);
+ s += p;
+ vst1_u8(s, s5);
+ s += p;
+ vst1_u8(s, s6);
+ s += p;
+ vst1_u8(s, s7);
+}
+
+static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+ const uint8x8_t s1, const uint8x8_t s2,
+ const uint8x8_t s3) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+ vst1_u8(s, s2);
+ s += p;
+ vst1_u8(s, s3);
+}
+
+static INLINE void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
+ const uint8x16_t s1, const uint8x16_t s2,
+ const uint8x16_t s3) {
+ vst1q_u8(s, s0);
+ s += p;
+ vst1q_u8(s, s1);
+ s += p;
+ vst1q_u8(s, s2);
+ s += p;
+ vst1q_u8(s, s3);
+}
+
+static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
+ const uint16x8_t s0, const uint16x8_t s1,
+ const uint16x8_t s2, const uint16x8_t s3,
+ const uint16x8_t s4, const uint16x8_t s5,
+ const uint16x8_t s6, const uint16x8_t s7) {
+ vst1q_u16(s, s0);
+ s += dst_stride;
+ vst1q_u16(s, s1);
+ s += dst_stride;
+ vst1q_u16(s, s2);
+ s += dst_stride;
+ vst1q_u16(s, s3);
+ s += dst_stride;
+ vst1q_u16(s, s4);
+ s += dst_stride;
+ vst1q_u16(s, s5);
+ s += dst_stride;
+ vst1q_u16(s, s6);
+ s += dst_stride;
+ vst1q_u16(s, s7);
+}
+
+static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
+ const uint16x4_t s0, const uint16x4_t s1,
+ const uint16x4_t s2, const uint16x4_t s3) {
+ vst1_u16(s, s0);
+ s += dst_stride;
+ vst1_u16(s, s1);
+ s += dst_stride;
+ vst1_u16(s, s2);
+ s += dst_stride;
+ vst1_u16(s, s3);
+}
+
+static INLINE void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
+ const uint16x8_t s0, const uint16x8_t s1) {
+ vst1q_u16(s, s0);
+ s += dst_stride;
+ vst1q_u16(s, s1);
+}
+
+static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
+ const uint16x8_t s0, const uint16x8_t s1,
+ const uint16x8_t s2, const uint16x8_t s3) {
+ vst1q_u16(s, s0);
+ s += dst_stride;
+ vst1q_u16(s, s1);
+ s += dst_stride;
+ vst1q_u16(s, s2);
+ s += dst_stride;
+ vst1q_u16(s, s3);
+}
+
+static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
+ const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7) {
+ vst1q_s16(s, s0);
+ s += dst_stride;
+ vst1q_s16(s, s1);
+ s += dst_stride;
+ vst1q_s16(s, s2);
+ s += dst_stride;
+ vst1q_s16(s, s3);
+ s += dst_stride;
+ vst1q_s16(s, s4);
+ s += dst_stride;
+ vst1q_s16(s, s5);
+ s += dst_stride;
+ vst1q_s16(s, s6);
+ s += dst_stride;
+ vst1q_s16(s, s7);
+}
+
+static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
+ const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3) {
+ vst1_s16(s, s0);
+ s += dst_stride;
+ vst1_s16(s, s1);
+ s += dst_stride;
+ vst1_s16(s, s2);
+ s += dst_stride;
+ vst1_s16(s, s3);
+}
+
+static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
+ const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3) {
+ vst1q_s16(s, s0);
+ s += dst_stride;
+ vst1q_s16(s, s1);
+ s += dst_stride;
+ vst1q_s16(s, s2);
+ s += dst_stride;
+ vst1q_s16(s, s3);
+}
+
+static INLINE void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4, uint8x8_t *const s5,
+ uint8x8_t *const s6, uint8x8_t *const s7,
+ uint8x8_t *const s8, uint8x8_t *const s9,
+ uint8x8_t *const s10) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+ s += p;
+ *s5 = vld1_u8(s);
+ s += p;
+ *s6 = vld1_u8(s);
+ s += p;
+ *s7 = vld1_u8(s);
+ s += p;
+ *s8 = vld1_u8(s);
+ s += p;
+ *s9 = vld1_u8(s);
+ s += p;
+ *s10 = vld1_u8(s);
+}
+
+static INLINE void load_s16_8x10(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6, int16x8_t *const s7,
+ int16x8_t *const s8, int16x8_t *const s9) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+ s += p;
+ *s8 = vld1q_s16(s);
+ s += p;
+ *s9 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x11(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6, int16x8_t *const s7,
+ int16x8_t *const s8, int16x8_t *const s9,
+ int16x8_t *const s10) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+ s += p;
+ *s8 = vld1q_s16(s);
+ s += p;
+ *s9 = vld1q_s16(s);
+ s += p;
+ *s10 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x12(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6, int16x8_t *const s7,
+ int16x8_t *const s8, int16x8_t *const s9,
+ int16x8_t *const s10, int16x8_t *const s11) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+ s += p;
+ *s8 = vld1q_s16(s);
+ s += p;
+ *s9 = vld1q_s16(s);
+ s += p;
+ *s10 = vld1q_s16(s);
+ s += p;
+ *s11 = vld1q_s16(s);
+}
+
+static INLINE void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
+ uint16x8_t *const s0, uint16x8_t *const s1,
+ uint16x8_t *const s2, uint16x8_t *const s3,
+ uint16x8_t *const s4, uint16x8_t *const s5,
+ uint16x8_t *const s6, uint16x8_t *const s7,
+ uint16x8_t *const s8, uint16x8_t *const s9,
+ uint16x8_t *const s10) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+ s += p;
+ *s4 = vld1q_u16(s);
+ s += p;
+ *s5 = vld1q_u16(s);
+ s += p;
+ *s6 = vld1q_u16(s);
+ s += p;
+ *s7 = vld1q_u16(s);
+ s += p;
+ *s8 = vld1q_u16(s);
+ s += p;
+ *s9 = vld1q_u16(s);
+ s += p;
+ *s10 = vld1q_u16(s);
+}
+
+static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6, int16x8_t *const s7) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+}
+
+static INLINE void load_u16_8x7(const uint16_t *s, ptrdiff_t p,
+ uint16x8_t *const s0, uint16x8_t *const s1,
+ uint16x8_t *const s2, uint16x8_t *const s3,
+ uint16x8_t *const s4, uint16x8_t *const s5,
+ uint16x8_t *const s6) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+ s += p;
+ *s4 = vld1q_u16(s);
+ s += p;
+ *s5 = vld1q_u16(s);
+ s += p;
+ *s6 = vld1q_u16(s);
+}
+
+static INLINE void load_s16_8x7(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x6(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x5(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+}
+
+// Load 2 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
+ uint32_t a;
+ memcpy(&a, buf, 4);
+ buf += stride;
+ uint32x2_t a_u32 = vdup_n_u32(a);
+ memcpy(&a, buf, 4);
+ a_u32 = vset_lane_u32(a, a_u32, 1);
+ return vreinterpret_u8_u32(a_u32);
+}
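+
+// The memcpy-based loads above and below avoid the undefined behaviour of
+// dereferencing a misaligned uint32_t pointer; compilers can generally fold
+// each 4-byte memcpy into a single unaligned load instruction.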
+
+// Load 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
+ uint32_t a;
+ uint32x4_t a_u32;
+ if (stride == 4) return vld1q_u8(buf);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vdupq_n_u32(a);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 1);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 2);
+ memcpy(&a, buf, 4);
+ a_u32 = vsetq_lane_u32(a, a_u32, 3);
+ return vreinterpretq_u8_u32(a_u32);
+}
+
+static INLINE uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) {
+ uint16_t a;
+ uint16x4_t a_u16;
+
+ memcpy(&a, buf, 2);
+ buf += stride;
+ a_u16 = vdup_n_u16(a);
+ memcpy(&a, buf, 2);
+ a_u16 = vset_lane_u16(a, a_u16, 1);
+ return vreinterpret_u8_u16(a_u16);
+}
+
+static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
+ uint32_t a;
+ uint32x2_t a_u32;
+
+ memcpy(&a, buf, 4);
+ a_u32 = vdup_n_u32(0);
+ a_u32 = vset_lane_u32(a, a_u32, 0);
+ return vreinterpret_u8_u32(a_u32);
+}
+
+static INLINE uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) {
+ uint32_t a;
+ uint32x2_t a_u32;
+
+ memcpy(&a, buf, 4);
+ a_u32 = vdup_n_u32(a);
+ return vreinterpret_u8_u32(a_u32);
+}
+
+static INLINE uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) {
+ uint16_t a;
+  uint16x4_t a_u16;
+
+  memcpy(&a, buf, 2);
+  a_u16 = vdup_n_u16(a);
+  return vreinterpret_u8_u16(a_u16);
+}
+
+static INLINE uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) {
+ uint32_t a;
+ uint32x2_t a_u32;
+
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vdup_n_u32(a);
+ memcpy(&a, buf, 4);
+ a_u32 = vset_lane_u32(a, a_u32, 1);
+ return vreinterpret_u8_u32(a_u32);
+}
+
+static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
+ uint8x8_t *tu0, uint8x8_t *tu1) {
+ *tu0 = load_unaligned_u8_4x2(buf, stride);
+ buf += 2 * stride;
+ *tu1 = load_unaligned_u8_4x2(buf, stride);
+}
+
+static INLINE void load_unaligned_u8_3x8(const uint8_t *buf, int stride,
+ uint8x8_t *tu0, uint8x8_t *tu1,
+ uint8x8_t *tu2) {
+ load_unaligned_u8_4x4(buf, stride, tu0, tu1);
+ buf += 4 * stride;
+ *tu2 = load_unaligned_u8_4x2(buf, stride);
+}
+
+static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
+ uint8x8_t *tu0, uint8x8_t *tu1,
+ uint8x8_t *tu2, uint8x8_t *tu3) {
+ load_unaligned_u8_4x4(buf, stride, tu0, tu1);
+ buf += 4 * stride;
+ load_unaligned_u8_4x4(buf, stride, tu2, tu3);
+}
+
+static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2, uint8x16_t *const s3,
+ uint8x16_t *const s4, uint8x16_t *const s5,
+ uint8x16_t *const s6, uint8x16_t *const s7) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+ s += p;
+ *s3 = vld1q_u8(s);
+ s += p;
+ *s4 = vld1q_u8(s);
+ s += p;
+ *s5 = vld1q_u8(s);
+ s += p;
+ *s6 = vld1q_u8(s);
+ s += p;
+ *s7 = vld1q_u8(s);
+}
+
+static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2, uint8x16_t *const s3) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+ s += p;
+ *s3 = vld1q_u8(s);
+}
+
+static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
+ uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
+ uint16x8_t *s6, uint16x8_t *s7) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+ s += p;
+ *s4 = vld1q_u16(s);
+ s += p;
+ *s5 = vld1q_u16(s);
+ s += p;
+ *s6 = vld1q_u16(s);
+ s += p;
+ *s7 = vld1q_u16(s);
+}
+
+static INLINE void load_u16_16x4(const uint16_t *s, ptrdiff_t p,
+ uint16x8_t *const s0, uint16x8_t *const s1,
+ uint16x8_t *const s2, uint16x8_t *const s3,
+ uint16x8_t *const s4, uint16x8_t *const s5,
+ uint16x8_t *const s6, uint16x8_t *const s7) {
+ *s0 = vld1q_u16(s);
+ *s1 = vld1q_u16(s + 8);
+ s += p;
+ *s2 = vld1q_u16(s);
+ *s3 = vld1q_u16(s + 8);
+ s += p;
+ *s4 = vld1q_u16(s);
+ *s5 = vld1q_u16(s + 8);
+ s += p;
+ *s6 = vld1q_u16(s);
+ *s7 = vld1q_u16(s + 8);
+}
+
+static INLINE uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf,
+ int stride) {
+ uint32_t a;
+ uint32x2_t a_u32;
+
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vdup_n_u32(a);
+ memcpy(&a, buf, 4);
+ a_u32 = vset_lane_u32(a, a_u32, 1);
+ return vreinterpret_u16_u32(a_u32);
+}
+
+static INLINE uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) {
+ uint64_t a;
+ uint64x1_t a_u64 = vdup_n_u64(0);
+ memcpy(&a, buf, 8);
+ a_u64 = vset_lane_u64(a, a_u64, 0);
+ return vreinterpret_u16_u64(a_u64);
+}
+
+static INLINE uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf,
+ uint32_t stride) {
+ uint64_t a;
+ uint64x2_t a_u64;
+
+ memcpy(&a, buf, 8);
+ buf += stride;
+ a_u64 = vdupq_n_u64(0);
+ a_u64 = vsetq_lane_u64(a, a_u64, 0);
+ memcpy(&a, buf, 8);
+ buf += stride;
+ a_u64 = vsetq_lane_u64(a, a_u64, 1);
+ return vreinterpretq_u16_u64(a_u64);
+}
+
+static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
+ uint16x8_t *tu0, uint16x8_t *tu1) {
+ *tu0 = load_unaligned_u16_4x2(buf, stride);
+ buf += 2 * stride;
+ *tu1 = load_unaligned_u16_4x2(buf, stride);
+}
+
+static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
+ int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
+ *s1 = vld1q_s32(s);
+ s += p;
+ *s2 = vld1q_s32(s);
+ s += p;
+ *s3 = vld1q_s32(s);
+ s += p;
+ *s4 = vld1q_s32(s);
+}
+
+static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
+ int32x4_t s2, int32x4_t s3, int32x4_t s4) {
+ vst1q_s32(s, s1);
+ s += p;
+ vst1q_s32(s, s2);
+ s += p;
+ vst1q_s32(s, s3);
+ s += p;
+ vst1q_s32(s, s4);
+}
+
+static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
+ uint32x4_t *s2, uint32x4_t *s3,
+ uint32x4_t *s4) {
+ *s1 = vld1q_u32(s);
+ s += p;
+ *s2 = vld1q_u32(s);
+ s += p;
+ *s3 = vld1q_u32(s);
+ s += p;
+ *s4 = vld1q_u32(s);
+}
+
+static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
+ uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
+ vst1q_u32(s, s1);
+ s += p;
+ vst1q_u32(s, s2);
+ s += p;
+ vst1q_u32(s, s3);
+ s += p;
+ vst1q_u32(s, s4);
+}
+
+static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
+ const int32x4_t v0 = vld1q_s32(buf);
+ const int32x4_t v1 = vld1q_s32(buf + 4);
+ const int16x4_t s0 = vmovn_s32(v0);
+ const int16x4_t s1 = vmovn_s32(v1);
+ return vcombine_s16(s0, s1);
+}
+
+static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
+ const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
+ const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
+ vst1q_s32(buf, v0);
+ vst1q_s32(buf + 4, v1);
+}
+
+static INLINE void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) {
+ const int32x4_t v0 = vmovl_s16(a);
+ vst1q_s32(buf, v0);
+}
+
+static INLINE uint8x8_t load_u8_gather_s16_x8(const uint8_t *src,
+ int16x8_t indices) {
+ // Recent Clang and GCC versions correctly identify that this zero-broadcast
+ // is redundant. Alternatively we could load and broadcast the zeroth element
+  // and then replace the other lanes; however, this is slower than loading a
+ // single element without broadcast on some micro-architectures.
+ uint8x8_t ret = vdup_n_u8(0);
+ ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 0), ret, 0);
+ ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 1), ret, 1);
+ ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 2), ret, 2);
+ ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 3), ret, 3);
+ ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 0), ret, 4);
+ ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 1), ret, 5);
+ ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 2), ret, 6);
+ ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 3), ret, 7);
+ return ret;
+}
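+
+// In scalar terms the gather above is ret[i] = src[indices[i]] for i in
+// [0, 8), with each int16_t index used as a byte offset from src.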
+
+// The `lane` parameter here must be an immediate.
+#define store_u8_2x1_lane(dst, src, lane) \
+ do { \
+ uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
+ memcpy(dst, &a, 2); \
+ } while (0)
+
+#define store_u8_4x1_lane(dst, src, lane) \
+ do { \
+ uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
+ memcpy(dst, &a, 4); \
+ } while (0)
+
+#define store_u16_2x1_lane(dst, src, lane) \
+ do { \
+ uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
+ memcpy(dst, &a, 4); \
+ } while (0)
+
+#define store_u16_4x1_lane(dst, src, lane) \
+ do { \
+ uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
+ memcpy(dst, &a, 8); \
+ } while (0)
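+
+// The four store_*_lane helpers above are macros rather than functions because
+// the Neon v(get|getq)_lane intrinsics require the lane index to be a
+// compile-time immediate; they are #undef'd at the end of this header so they
+// do not leak into including files.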
+
+// Store the low 16-bits from a single vector.
+static INLINE void store_u8_2x1(uint8_t *dst, const uint8x8_t src) {
+ store_u8_2x1_lane(dst, src, 0);
+}
+
+// Store the low 32-bits from a single vector.
+static INLINE void store_u8_4x1(uint8_t *dst, const uint8x8_t src) {
+ store_u8_4x1_lane(dst, src, 0);
+}
+
+// Store two blocks of 16-bits from a single vector.
+static INLINE void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride,
+ uint8x8_t src) {
+ store_u8_2x1_lane(dst, src, 0);
+ dst += dst_stride;
+ store_u8_2x1_lane(dst, src, 1);
+}
+
+// Store two blocks of 32-bits from a single vector.
+static INLINE void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride,
+ uint8x8_t src) {
+ store_u8_4x1_lane(dst, src, 0);
+ dst += stride;
+ store_u8_4x1_lane(dst, src, 1);
+}
+
+// Store four blocks of 32-bits from a single vector.
+static INLINE void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride,
+ uint8x16_t src) {
+ store_u8_4x1_lane(dst, vget_low_u8(src), 0);
+ dst += stride;
+ store_u8_4x1_lane(dst, vget_low_u8(src), 1);
+ dst += stride;
+ store_u8_4x1_lane(dst, vget_high_u8(src), 0);
+ dst += stride;
+ store_u8_4x1_lane(dst, vget_high_u8(src), 1);
+}
+
+// Store the low 32-bits from a single vector.
+static INLINE void store_u16_2x1(uint16_t *dst, const uint16x4_t src) {
+ store_u16_2x1_lane(dst, src, 0);
+}
+
+// Store two blocks of 32-bits from a single vector.
+static INLINE void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride,
+ uint16x4_t src) {
+ store_u16_2x1_lane(dst, src, 0);
+ dst += dst_stride;
+ store_u16_2x1_lane(dst, src, 1);
+}
+
+// Store two blocks of 64-bits from a single vector.
+static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride,
+ uint16x8_t src) {
+ store_u16_4x1_lane(dst, src, 0);
+ dst += dst_stride;
+ store_u16_4x1_lane(dst, src, 1);
+}
+
+#undef store_u8_2x1_lane
+#undef store_u8_4x1_lane
+#undef store_u16_2x1_lane
+#undef store_u16_4x1_lane
+
+#endif // AOM_AOM_DSP_ARM_MEM_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/obmc_sad_neon.c b/third_party/aom/aom_dsp/arm/obmc_sad_neon.c
new file mode 100644
index 0000000000..a692cbb388
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/obmc_sad_neon.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "mem_neon.h"
+#include "sum_neon.h"
+
+static INLINE void obmc_sad_8x1_s16_neon(int16x8_t ref_s16, const int32_t *mask,
+ const int32_t *wsrc, uint32x4_t *sum) {
+ int32x4_t wsrc_lo = vld1q_s32(wsrc);
+ int32x4_t wsrc_hi = vld1q_s32(wsrc + 4);
+
+ int32x4_t mask_lo = vld1q_s32(mask);
+ int32x4_t mask_hi = vld1q_s32(mask + 4);
+
+ int16x8_t mask_s16 =
+ vuzpq_s16(vreinterpretq_s16_s32(mask_lo), vreinterpretq_s16_s32(mask_hi))
+ .val[0];
+
+ int32x4_t pre_lo = vmull_s16(vget_low_s16(ref_s16), vget_low_s16(mask_s16));
+ int32x4_t pre_hi = vmull_s16(vget_high_s16(ref_s16), vget_high_s16(mask_s16));
+
+ uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo));
+ uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi));
+
+ *sum = vrsraq_n_u32(*sum, abs_lo, 12);
+ *sum = vrsraq_n_u32(*sum, abs_hi, 12);
+}
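+
+// A scalar sketch of the helper above, where ref[i] are the 8 widened
+// reference pixels and ROUND_POWER_OF_TWO() is the usual add-then-shift
+// rounding:
+//   for (int i = 0; i < 8; i++)
+//     sad += ROUND_POWER_OF_TWO(abs(wsrc[i] - mask[i] * ref[i]), 12);
+// vrsraq_n_u32 performs the rounding shift right by 12 and the accumulation in
+// a single instruction; the inputs are non-negative, so unsigned rounding
+// matches the signed reference here.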
+
+#if AOM_ARCH_AARCH64
+
+// Use tbl for doing a double-width zero extension from 8->32 bits since we can
+// do this in one instruction rather than two (indices out of range (255 here)
+// are set to zero by tbl).
+DECLARE_ALIGNED(16, static const uint8_t, obmc_variance_permute_idx[]) = {
+ 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255,
+ 4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255,
+ 8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255,
+ 12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255
+};
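+
+// For example, with input bytes r = { b0, b1, ..., b15 }, vqtbl1q_u8(r,
+// pre_idx0) yields { b0, 0, 0, 0, b1, 0, 0, 0, b2, 0, 0, 0, b3, 0, 0, 0 },
+// i.e. b0..b3 zero-extended into four little-endian uint32_t lanes; pre_idx1,
+// pre_idx2 and pre_idx3 do the same for b4..b7, b8..b11 and b12..b15.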
+
+static INLINE void obmc_sad_8x1_s32_neon(uint32x4_t ref_u32_lo,
+ uint32x4_t ref_u32_hi,
+ const int32_t *mask,
+ const int32_t *wsrc,
+ uint32x4_t sum[2]) {
+ int32x4_t wsrc_lo = vld1q_s32(wsrc);
+ int32x4_t wsrc_hi = vld1q_s32(wsrc + 4);
+ int32x4_t mask_lo = vld1q_s32(mask);
+ int32x4_t mask_hi = vld1q_s32(mask + 4);
+
+ int32x4_t pre_lo = vmulq_s32(vreinterpretq_s32_u32(ref_u32_lo), mask_lo);
+ int32x4_t pre_hi = vmulq_s32(vreinterpretq_s32_u32(ref_u32_hi), mask_hi);
+
+ uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo));
+ uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi));
+
+ sum[0] = vrsraq_n_u32(sum[0], abs_lo, 12);
+ sum[1] = vrsraq_n_u32(sum[1], abs_hi, 12);
+}
+
+static INLINE unsigned int obmc_sad_large_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int width,
+ int height) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ // Use tbl for doing a double-width zero extension from 8->32 bits since we
+ // can do this in one instruction rather than two.
+ uint8x16_t pre_idx0 = vld1q_u8(&obmc_variance_permute_idx[0]);
+ uint8x16_t pre_idx1 = vld1q_u8(&obmc_variance_permute_idx[16]);
+ uint8x16_t pre_idx2 = vld1q_u8(&obmc_variance_permute_idx[32]);
+ uint8x16_t pre_idx3 = vld1q_u8(&obmc_variance_permute_idx[48]);
+
+ int h = height;
+ do {
+ int w = width;
+ const uint8_t *ref_ptr = ref;
+ do {
+ uint8x16_t r = vld1q_u8(ref_ptr);
+
+ uint32x4_t ref_u32_lo = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx0));
+ uint32x4_t ref_u32_hi = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx1));
+ obmc_sad_8x1_s32_neon(ref_u32_lo, ref_u32_hi, mask, wsrc, sum);
+
+ ref_u32_lo = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx2));
+ ref_u32_hi = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx3));
+ obmc_sad_8x1_s32_neon(ref_u32_lo, ref_u32_hi, mask + 8, wsrc + 8, sum);
+
+ ref_ptr += 16;
+ wsrc += 16;
+ mask += 16;
+ w -= 16;
+ } while (w != 0);
+
+ ref += ref_stride;
+ } while (--h != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#else // !AOM_ARCH_AARCH64
+
+static INLINE unsigned int obmc_sad_large_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int width,
+ int height) {
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int h = height;
+ do {
+ int w = width;
+ const uint8_t *ref_ptr = ref;
+ do {
+ uint8x16_t r = vld1q_u8(ref_ptr);
+
+ int16x8_t ref_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(r)));
+ obmc_sad_8x1_s16_neon(ref_s16, mask, wsrc, &sum);
+
+ ref_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(r)));
+ obmc_sad_8x1_s16_neon(ref_s16, mask + 8, wsrc + 8, &sum);
+
+ ref_ptr += 16;
+ wsrc += 16;
+ mask += 16;
+ w -= 16;
+ } while (w != 0);
+
+ ref += ref_stride;
+ } while (--h != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+#endif // AOM_ARCH_AARCH64
+
+static INLINE unsigned int obmc_sad_128xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h) {
+ return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 128, h);
+}
+
+static INLINE unsigned int obmc_sad_64xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h) {
+ return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h);
+}
+
+static INLINE unsigned int obmc_sad_32xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h) {
+ return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 32, h);
+}
+
+static INLINE unsigned int obmc_sad_16xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h) {
+ return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h);
+}
+
+static INLINE unsigned int obmc_sad_8xh_neon(const uint8_t *ref, int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int height) {
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int h = height;
+ do {
+ uint8x8_t r = vld1_u8(ref);
+
+ int16x8_t ref_s16 = vreinterpretq_s16_u16(vmovl_u8(r));
+ obmc_sad_8x1_s16_neon(ref_s16, mask, wsrc, &sum);
+
+ ref += ref_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--h != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE unsigned int obmc_sad_4xh_neon(const uint8_t *ref, int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int height) {
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int h = height / 2;
+ do {
+ uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+
+ int16x8_t ref_s16 = vreinterpretq_s16_u16(vmovl_u8(r));
+ obmc_sad_8x1_s16_neon(ref_s16, mask, wsrc, &sum);
+
+ ref += 2 * ref_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--h != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+#define OBMC_SAD_WXH_NEON(w, h) \
+ unsigned int aom_obmc_sad##w##x##h##_neon( \
+ const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ return obmc_sad_##w##xh_neon(ref, ref_stride, wsrc, mask, h); \
+ }
+
+OBMC_SAD_WXH_NEON(4, 4)
+OBMC_SAD_WXH_NEON(4, 8)
+OBMC_SAD_WXH_NEON(4, 16)
+
+OBMC_SAD_WXH_NEON(8, 4)
+OBMC_SAD_WXH_NEON(8, 8)
+OBMC_SAD_WXH_NEON(8, 16)
+OBMC_SAD_WXH_NEON(8, 32)
+
+OBMC_SAD_WXH_NEON(16, 4)
+OBMC_SAD_WXH_NEON(16, 8)
+OBMC_SAD_WXH_NEON(16, 16)
+OBMC_SAD_WXH_NEON(16, 32)
+OBMC_SAD_WXH_NEON(16, 64)
+
+OBMC_SAD_WXH_NEON(32, 8)
+OBMC_SAD_WXH_NEON(32, 16)
+OBMC_SAD_WXH_NEON(32, 32)
+OBMC_SAD_WXH_NEON(32, 64)
+
+OBMC_SAD_WXH_NEON(64, 16)
+OBMC_SAD_WXH_NEON(64, 32)
+OBMC_SAD_WXH_NEON(64, 64)
+OBMC_SAD_WXH_NEON(64, 128)
+
+OBMC_SAD_WXH_NEON(128, 64)
+OBMC_SAD_WXH_NEON(128, 128)
diff --git a/third_party/aom/aom_dsp/arm/obmc_variance_neon.c b/third_party/aom/aom_dsp/arm/obmc_variance_neon.c
new file mode 100644
index 0000000000..50cd5f3b6a
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/obmc_variance_neon.c
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "mem_neon.h"
+#include "sum_neon.h"
+
+static INLINE void obmc_variance_8x1_s16_neon(int16x8_t pre_s16,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int32x4_t *ssev,
+ int32x4_t *sumv) {
+ // For 4xh and 8xh we observe it is faster to avoid the double-widening of
+  // pre. Instead we do a single widening step and narrow the mask to 16 bits
+  // to allow us to perform a widening multiply. Widening multiply
+  // instructions have better throughput on some micro-architectures, but for
+  // the larger block sizes this benefit is outweighed by the additional
+ // instruction needed to first narrow the mask vectors.
+
+ int32x4_t wsrc_s32_lo = vld1q_s32(&wsrc[0]);
+ int32x4_t wsrc_s32_hi = vld1q_s32(&wsrc[4]);
+ int16x8_t mask_s16 = vuzpq_s16(vreinterpretq_s16_s32(vld1q_s32(&mask[0])),
+ vreinterpretq_s16_s32(vld1q_s32(&mask[4])))
+ .val[0];
+
+ int32x4_t diff_s32_lo =
+ vmlsl_s16(wsrc_s32_lo, vget_low_s16(pre_s16), vget_low_s16(mask_s16));
+ int32x4_t diff_s32_hi =
+ vmlsl_s16(wsrc_s32_hi, vget_high_s16(pre_s16), vget_high_s16(mask_s16));
+
+ // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away
+ // from zero, however vrshrq_n_s32 rounds to nearest with ties rounded up.
+ // This difference only affects the bit patterns at the rounding breakpoints
+ // exactly, so we can add -1 to all negative numbers to move the breakpoint
+ // one value across and into the correct rounding region.
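+  // Worked example for diff = -2048: the scalar reference rounds the tie
+  // -2048 / 4096 = -0.5 away from zero to -1, whereas vrshrq_n_s32 would give
+  // (-2048 + 2048) >> 12 = 0. Adding diff >> 31 (-1 for negative values) first
+  // gives (-2049 + 2048) >> 12 = -1, matching the reference.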
+ diff_s32_lo = vsraq_n_s32(diff_s32_lo, diff_s32_lo, 31);
+ diff_s32_hi = vsraq_n_s32(diff_s32_hi, diff_s32_hi, 31);
+ int32x4_t round_s32_lo = vrshrq_n_s32(diff_s32_lo, 12);
+ int32x4_t round_s32_hi = vrshrq_n_s32(diff_s32_hi, 12);
+
+ *sumv = vrsraq_n_s32(*sumv, diff_s32_lo, 12);
+ *sumv = vrsraq_n_s32(*sumv, diff_s32_hi, 12);
+ *ssev = vmlaq_s32(*ssev, round_s32_lo, round_s32_lo);
+ *ssev = vmlaq_s32(*ssev, round_s32_hi, round_s32_hi);
+}
+
+#if AOM_ARCH_AARCH64
+
+// Use tbl for doing a double-width zero extension from 8->32 bits since we can
+// do this in one instruction rather than two (indices out of range (255 here)
+// are set to zero by tbl).
+DECLARE_ALIGNED(16, static const uint8_t, obmc_variance_permute_idx[]) = {
+ 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255,
+ 4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255,
+ 8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255,
+ 12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255
+};
+
+static INLINE void obmc_variance_8x1_s32_neon(
+ int32x4_t pre_lo, int32x4_t pre_hi, const int32_t *wsrc,
+ const int32_t *mask, int32x4_t *ssev, int32x4_t *sumv) {
+ int32x4_t wsrc_lo = vld1q_s32(&wsrc[0]);
+ int32x4_t wsrc_hi = vld1q_s32(&wsrc[4]);
+ int32x4_t mask_lo = vld1q_s32(&mask[0]);
+ int32x4_t mask_hi = vld1q_s32(&mask[4]);
+
+ int32x4_t diff_lo = vmlsq_s32(wsrc_lo, pre_lo, mask_lo);
+ int32x4_t diff_hi = vmlsq_s32(wsrc_hi, pre_hi, mask_hi);
+
+ // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away from
+ // zero, however vrshrq_n_s32 rounds to nearest with ties rounded up. This
+ // difference only affects the bit patterns at the rounding breakpoints
+ // exactly, so we can add -1 to all negative numbers to move the breakpoint
+ // one value across and into the correct rounding region.
+ diff_lo = vsraq_n_s32(diff_lo, diff_lo, 31);
+ diff_hi = vsraq_n_s32(diff_hi, diff_hi, 31);
+ int32x4_t round_lo = vrshrq_n_s32(diff_lo, 12);
+ int32x4_t round_hi = vrshrq_n_s32(diff_hi, 12);
+
+ *sumv = vrsraq_n_s32(*sumv, diff_lo, 12);
+ *sumv = vrsraq_n_s32(*sumv, diff_hi, 12);
+ *ssev = vmlaq_s32(*ssev, round_lo, round_lo);
+ *ssev = vmlaq_s32(*ssev, round_hi, round_hi);
+}
+
+static INLINE void obmc_variance_large_neon(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int width,
+ int height, unsigned *sse,
+ int *sum) {
+ assert(width % 16 == 0);
+
+ // Use tbl for doing a double-width zero extension from 8->32 bits since we
+ // can do this in one instruction rather than two.
+ uint8x16_t pre_idx0 = vld1q_u8(&obmc_variance_permute_idx[0]);
+ uint8x16_t pre_idx1 = vld1q_u8(&obmc_variance_permute_idx[16]);
+ uint8x16_t pre_idx2 = vld1q_u8(&obmc_variance_permute_idx[32]);
+ uint8x16_t pre_idx3 = vld1q_u8(&obmc_variance_permute_idx[48]);
+
+ int32x4_t ssev = vdupq_n_s32(0);
+ int32x4_t sumv = vdupq_n_s32(0);
+
+ int h = height;
+ do {
+ int w = width;
+ do {
+ uint8x16_t pre_u8 = vld1q_u8(pre);
+
+ int32x4_t pre_s32_lo = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx0));
+ int32x4_t pre_s32_hi = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx1));
+ obmc_variance_8x1_s32_neon(pre_s32_lo, pre_s32_hi, &wsrc[0], &mask[0],
+ &ssev, &sumv);
+
+ pre_s32_lo = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx2));
+ pre_s32_hi = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx3));
+ obmc_variance_8x1_s32_neon(pre_s32_lo, pre_s32_hi, &wsrc[8], &mask[8],
+ &ssev, &sumv);
+
+ wsrc += 16;
+ mask += 16;
+ pre += 16;
+ w -= 16;
+ } while (w != 0);
+
+ pre += pre_stride - width;
+ } while (--h != 0);
+
+ *sse = horizontal_add_s32x4(ssev);
+ *sum = horizontal_add_s32x4(sumv);
+}
+
+#else // !AOM_ARCH_AARCH64
+
+static INLINE void obmc_variance_large_neon(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int width,
+ int height, unsigned *sse,
+ int *sum) {
+ // Non-aarch64 targets do not have a 128-bit tbl instruction, so use the
+ // widening version of the core kernel instead.
+
+ assert(width % 16 == 0);
+
+ int32x4_t ssev = vdupq_n_s32(0);
+ int32x4_t sumv = vdupq_n_s32(0);
+
+ int h = height;
+ do {
+ int w = width;
+ do {
+ uint8x16_t pre_u8 = vld1q_u8(pre);
+
+ int16x8_t pre_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pre_u8)));
+ obmc_variance_8x1_s16_neon(pre_s16, &wsrc[0], &mask[0], &ssev, &sumv);
+
+ pre_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pre_u8)));
+ obmc_variance_8x1_s16_neon(pre_s16, &wsrc[8], &mask[8], &ssev, &sumv);
+
+ wsrc += 16;
+ mask += 16;
+ pre += 16;
+ w -= 16;
+ } while (w != 0);
+
+ pre += pre_stride - width;
+ } while (--h != 0);
+
+ *sse = horizontal_add_s32x4(ssev);
+ *sum = horizontal_add_s32x4(sumv);
+}
+
+#endif // AOM_ARCH_AARCH64
+
+static INLINE void obmc_variance_neon_128xh(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ unsigned *sse, int *sum) {
+ obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 128, h, sse, sum);
+}
+
+static INLINE void obmc_variance_neon_64xh(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ unsigned *sse, int *sum) {
+ obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 64, h, sse, sum);
+}
+
+static INLINE void obmc_variance_neon_32xh(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ unsigned *sse, int *sum) {
+ obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 32, h, sse, sum);
+}
+
+static INLINE void obmc_variance_neon_16xh(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ unsigned *sse, int *sum) {
+ obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 16, h, sse, sum);
+}
+
+static INLINE void obmc_variance_neon_8xh(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ unsigned *sse, int *sum) {
+ int32x4_t ssev = vdupq_n_s32(0);
+ int32x4_t sumv = vdupq_n_s32(0);
+
+ do {
+ uint8x8_t pre_u8 = vld1_u8(pre);
+ int16x8_t pre_s16 = vreinterpretq_s16_u16(vmovl_u8(pre_u8));
+
+ obmc_variance_8x1_s16_neon(pre_s16, wsrc, mask, &ssev, &sumv);
+
+ pre += pre_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--h != 0);
+
+ *sse = horizontal_add_s32x4(ssev);
+ *sum = horizontal_add_s32x4(sumv);
+}
+
+static INLINE void obmc_variance_neon_4xh(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ unsigned *sse, int *sum) {
+ assert(h % 2 == 0);
+
+ int32x4_t ssev = vdupq_n_s32(0);
+ int32x4_t sumv = vdupq_n_s32(0);
+
+ do {
+ uint8x8_t pre_u8 = load_unaligned_u8(pre, pre_stride);
+ int16x8_t pre_s16 = vreinterpretq_s16_u16(vmovl_u8(pre_u8));
+
+ obmc_variance_8x1_s16_neon(pre_s16, wsrc, mask, &ssev, &sumv);
+
+ pre += 2 * pre_stride;
+ wsrc += 8;
+ mask += 8;
+ h -= 2;
+ } while (h != 0);
+
+ *sse = horizontal_add_s32x4(ssev);
+ *sum = horizontal_add_s32x4(sumv);
+}
+
+#define OBMC_VARIANCE_WXH_NEON(W, H) \
+ unsigned aom_obmc_variance##W##x##H##_neon( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned *sse) { \
+ int sum; \
+ obmc_variance_neon_##W##xh(pre, pre_stride, wsrc, mask, H, sse, &sum); \
+ return *sse - (unsigned)(((int64_t)sum * sum) / (W * H)); \
+ }
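+
+// The return expression is the usual variance identity, computed with a 64-bit
+// intermediate to avoid overflow of sum * sum:
+//   variance = sse - (sum * sum) / (width * height)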
+
+OBMC_VARIANCE_WXH_NEON(4, 4)
+OBMC_VARIANCE_WXH_NEON(4, 8)
+OBMC_VARIANCE_WXH_NEON(8, 4)
+OBMC_VARIANCE_WXH_NEON(8, 8)
+OBMC_VARIANCE_WXH_NEON(8, 16)
+OBMC_VARIANCE_WXH_NEON(16, 8)
+OBMC_VARIANCE_WXH_NEON(16, 16)
+OBMC_VARIANCE_WXH_NEON(16, 32)
+OBMC_VARIANCE_WXH_NEON(32, 16)
+OBMC_VARIANCE_WXH_NEON(32, 32)
+OBMC_VARIANCE_WXH_NEON(32, 64)
+OBMC_VARIANCE_WXH_NEON(64, 32)
+OBMC_VARIANCE_WXH_NEON(64, 64)
+OBMC_VARIANCE_WXH_NEON(64, 128)
+OBMC_VARIANCE_WXH_NEON(128, 64)
+OBMC_VARIANCE_WXH_NEON(128, 128)
+OBMC_VARIANCE_WXH_NEON(4, 16)
+OBMC_VARIANCE_WXH_NEON(16, 4)
+OBMC_VARIANCE_WXH_NEON(8, 32)
+OBMC_VARIANCE_WXH_NEON(32, 8)
+OBMC_VARIANCE_WXH_NEON(16, 64)
+OBMC_VARIANCE_WXH_NEON(64, 16)
diff --git a/third_party/aom/aom_dsp/arm/reinterpret_neon.h b/third_party/aom/aom_dsp/arm/reinterpret_neon.h
new file mode 100644
index 0000000000..f9702513ad
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/reinterpret_neon.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_
+#define AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h" // For AOM_FORCE_INLINE.
+#include "config/aom_config.h"
+
+#define REINTERPRET_NEON(u, to_sz, to_count, from_sz, from_count, n, q) \
+ static AOM_FORCE_INLINE u##int##to_sz##x##to_count##x##n##_t \
+ aom_reinterpret##q##_##u##to_sz##_##u##from_sz##_x##n( \
+ const u##int##from_sz##x##from_count##x##n##_t src) { \
+ u##int##to_sz##x##to_count##x##n##_t ret; \
+ for (int i = 0; i < (n); ++i) { \
+ ret.val[i] = vreinterpret##q##_##u##to_sz##_##u##from_sz(src.val[i]); \
+ } \
+ return ret; \
+ }
+
+REINTERPRET_NEON(u, 8, 8, 16, 4, 2, ) // uint8x8x2_t from uint16x4x2_t
+REINTERPRET_NEON(u, 8, 16, 16, 8, 2, q) // uint8x16x2_t from uint16x8x2_t
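+
+// For example, the second instantiation above expands to:
+//   static AOM_FORCE_INLINE uint8x16x2_t
+//   aom_reinterpretq_u8_u16_x2(const uint16x8x2_t src)
+// which reinterprets both .val elements with vreinterpretq_u8_u16.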
+
+#endif // AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/sad_neon.c b/third_party/aom/aom_dsp/arm/sad_neon.c
new file mode 100644
index 0000000000..46a1666331
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sad_neon.c
@@ -0,0 +1,873 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+  // We use 8 accumulators to prevent overflow for large values of 'h' and
+  // to enable optimal UADALP instruction throughput on CPUs that have
+  // either 2 or 4 Neon pipes.
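+  // As a quick worked bound: each vpadalq_u8 adds at most two absolute
+  // differences (2 * 255 = 510) into a uint16 lane per row, so even at
+  // h = 128 a lane reaches at most 510 * 128 = 65280 and cannot overflow.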
+ uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x16_t r0, r1, r2, r3, r4, r5, r6, r7;
+ uint8x16_t diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ s1 = vld1q_u8(src_ptr + 16);
+ r1 = vld1q_u8(ref_ptr + 16);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ s2 = vld1q_u8(src_ptr + 32);
+ r2 = vld1q_u8(ref_ptr + 32);
+ diff2 = vabdq_u8(s2, r2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ s3 = vld1q_u8(src_ptr + 48);
+ r3 = vld1q_u8(ref_ptr + 48);
+ diff3 = vabdq_u8(s3, r3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ s4 = vld1q_u8(src_ptr + 64);
+ r4 = vld1q_u8(ref_ptr + 64);
+ diff4 = vabdq_u8(s4, r4);
+ sum[4] = vpadalq_u8(sum[4], diff4);
+
+ s5 = vld1q_u8(src_ptr + 80);
+ r5 = vld1q_u8(ref_ptr + 80);
+ diff5 = vabdq_u8(s5, r5);
+ sum[5] = vpadalq_u8(sum[5], diff5);
+
+ s6 = vld1q_u8(src_ptr + 96);
+ r6 = vld1q_u8(ref_ptr + 96);
+ diff6 = vabdq_u8(s6, r6);
+ sum[6] = vpadalq_u8(sum[6], diff6);
+
+ s7 = vld1q_u8(src_ptr + 112);
+ r7 = vld1q_u8(ref_ptr + 112);
+ diff7 = vabdq_u8(s7, r7);
+ sum[7] = vpadalq_u8(sum[7], diff7);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[4]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[5]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[6]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[7]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3;
+ uint8x16_t diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ s1 = vld1q_u8(src_ptr + 16);
+ r1 = vld1q_u8(ref_ptr + 16);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ s2 = vld1q_u8(src_ptr + 32);
+ r2 = vld1q_u8(ref_ptr + 32);
+ diff2 = vabdq_u8(s2, r2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ s3 = vld1q_u8(src_ptr + 48);
+ r3 = vld1q_u8(ref_ptr + 48);
+ diff3 = vabdq_u8(s3, r3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t diff0 = vabdq_u8(s0, r0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t diff1 = vabdq_u8(s1, r1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ uint8x16_t r = vld1q_u8(ref_ptr);
+
+ uint8x16_t diff = vabdq_u8(s, r);
+ sum = vpadalq_u8(sum, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t r = vld1_u8(ref_ptr);
+
+ sum = vabal_u8(sum, s, r);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h / 2;
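+  // load_unaligned_u8() is assumed to pack two 4-byte rows (at 'ptr' and
+  // 'ptr + stride') into a single 8-byte vector, so each iteration below
+  // processes two rows at once.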
+ do {
+ uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+
+ sum = vabal_u8(sum, s, r);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+#define SAD_WXH_NEON(w, h) \
+ unsigned int aom_sad##w##x##h##_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \
+ }
+
+SAD_WXH_NEON(4, 4)
+SAD_WXH_NEON(4, 8)
+
+SAD_WXH_NEON(8, 4)
+SAD_WXH_NEON(8, 8)
+SAD_WXH_NEON(8, 16)
+
+SAD_WXH_NEON(16, 8)
+SAD_WXH_NEON(16, 16)
+SAD_WXH_NEON(16, 32)
+
+SAD_WXH_NEON(32, 16)
+SAD_WXH_NEON(32, 32)
+SAD_WXH_NEON(32, 64)
+
+SAD_WXH_NEON(64, 32)
+SAD_WXH_NEON(64, 64)
+SAD_WXH_NEON(64, 128)
+
+SAD_WXH_NEON(128, 64)
+SAD_WXH_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_NEON(4, 16)
+SAD_WXH_NEON(8, 32)
+SAD_WXH_NEON(16, 4)
+SAD_WXH_NEON(16, 64)
+SAD_WXH_NEON(32, 8)
+SAD_WXH_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_NEON
+
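+// The skip variants estimate the full SAD from every other row: both strides
+// are doubled and 'h' is halved, then the result is scaled by 2 to stay on
+// the same scale as the full computation.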
+#define SAD_SKIP_WXH_NEON(w, h) \
+ unsigned int aom_sad_skip_##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * \
+ sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \
+ }
+
+SAD_SKIP_WXH_NEON(4, 4)
+SAD_SKIP_WXH_NEON(4, 8)
+
+SAD_SKIP_WXH_NEON(8, 4)
+SAD_SKIP_WXH_NEON(8, 8)
+SAD_SKIP_WXH_NEON(8, 16)
+
+SAD_SKIP_WXH_NEON(16, 8)
+SAD_SKIP_WXH_NEON(16, 16)
+SAD_SKIP_WXH_NEON(16, 32)
+
+SAD_SKIP_WXH_NEON(32, 16)
+SAD_SKIP_WXH_NEON(32, 32)
+SAD_SKIP_WXH_NEON(32, 64)
+
+SAD_SKIP_WXH_NEON(64, 32)
+SAD_SKIP_WXH_NEON(64, 64)
+SAD_SKIP_WXH_NEON(64, 128)
+
+SAD_SKIP_WXH_NEON(128, 64)
+SAD_SKIP_WXH_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_NEON(4, 16)
+SAD_SKIP_WXH_NEON(8, 32)
+SAD_SKIP_WXH_NEON(16, 4)
+SAD_SKIP_WXH_NEON(16, 64)
+SAD_SKIP_WXH_NEON(32, 8)
+SAD_SKIP_WXH_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_NEON
+
+static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+  // We use 8 accumulators to prevent overflow for large values of 'h' and
+  // to enable optimal UADALP instruction throughput on CPUs that have
+  // either 2 or 4 Neon pipes.
+ uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int i = h;
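+  // vrhaddq_u8 forms the rounded average (ref + pred + 1) >> 1 of the
+  // reference and the second prediction, which is the value the SAD is
+  // measured against in these averaging variants.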
+ do {
+ uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x16_t r0, r1, r2, r3, r4, r5, r6, r7;
+ uint8x16_t p0, p1, p2, p3, p4, p5, p6, p7;
+ uint8x16_t avg0, avg1, avg2, avg3, avg4, avg5, avg6, avg7;
+ uint8x16_t diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ s1 = vld1q_u8(src_ptr + 16);
+ r1 = vld1q_u8(ref_ptr + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ s2 = vld1q_u8(src_ptr + 32);
+ r2 = vld1q_u8(ref_ptr + 32);
+ p2 = vld1q_u8(second_pred + 32);
+ avg2 = vrhaddq_u8(r2, p2);
+ diff2 = vabdq_u8(s2, avg2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ s3 = vld1q_u8(src_ptr + 48);
+ r3 = vld1q_u8(ref_ptr + 48);
+ p3 = vld1q_u8(second_pred + 48);
+ avg3 = vrhaddq_u8(r3, p3);
+ diff3 = vabdq_u8(s3, avg3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ s4 = vld1q_u8(src_ptr + 64);
+ r4 = vld1q_u8(ref_ptr + 64);
+ p4 = vld1q_u8(second_pred + 64);
+ avg4 = vrhaddq_u8(r4, p4);
+ diff4 = vabdq_u8(s4, avg4);
+ sum[4] = vpadalq_u8(sum[4], diff4);
+
+ s5 = vld1q_u8(src_ptr + 80);
+ r5 = vld1q_u8(ref_ptr + 80);
+ p5 = vld1q_u8(second_pred + 80);
+ avg5 = vrhaddq_u8(r5, p5);
+ diff5 = vabdq_u8(s5, avg5);
+ sum[5] = vpadalq_u8(sum[5], diff5);
+
+ s6 = vld1q_u8(src_ptr + 96);
+ r6 = vld1q_u8(ref_ptr + 96);
+ p6 = vld1q_u8(second_pred + 96);
+ avg6 = vrhaddq_u8(r6, p6);
+ diff6 = vabdq_u8(s6, avg6);
+ sum[6] = vpadalq_u8(sum[6], diff6);
+
+ s7 = vld1q_u8(src_ptr + 112);
+ r7 = vld1q_u8(ref_ptr + 112);
+ p7 = vld1q_u8(second_pred + 112);
+ avg7 = vrhaddq_u8(r7, p7);
+ diff7 = vabdq_u8(s7, avg7);
+ sum[7] = vpadalq_u8(sum[7], diff7);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 128;
+ } while (--i != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[4]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[5]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[6]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[7]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+ uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ s1 = vld1q_u8(src_ptr + 16);
+ r1 = vld1q_u8(ref_ptr + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ s2 = vld1q_u8(src_ptr + 32);
+ r2 = vld1q_u8(ref_ptr + 32);
+ p2 = vld1q_u8(second_pred + 32);
+ avg2 = vrhaddq_u8(r2, p2);
+ diff2 = vabdq_u8(s2, avg2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ s3 = vld1q_u8(src_ptr + 48);
+ r3 = vld1q_u8(ref_ptr + 48);
+ p3 = vld1q_u8(second_pred + 48);
+ avg3 = vrhaddq_u8(r3, p3);
+ diff3 = vabdq_u8(s3, avg3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
+ } while (--i != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t avg0 = vrhaddq_u8(r0, p0);
+ uint8x16_t diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t avg1 = vrhaddq_u8(r1, p1);
+ uint8x16_t diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ uint8x16_t r = vld1q_u8(ref_ptr);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t avg = vrhaddq_u8(r, p);
+ uint8x16_t diff = vabdq_u8(s, avg);
+ sum = vpadalq_u8(sum, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t r = vld1_u8(ref_ptr);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t avg = vrhadd_u8(r, p);
+ sum = vabal_u8(sum, s, avg);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 8;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h / 2;
+ do {
+ uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t avg = vrhadd_u8(r, p);
+ sum = vabal_u8(sum, s, avg);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ second_pred += 8;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+#define SAD_WXH_AVG_NEON(w, h) \
+ unsigned int aom_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+ second_pred); \
+ }
+
+SAD_WXH_AVG_NEON(4, 4)
+SAD_WXH_AVG_NEON(4, 8)
+
+SAD_WXH_AVG_NEON(8, 4)
+SAD_WXH_AVG_NEON(8, 8)
+SAD_WXH_AVG_NEON(8, 16)
+
+SAD_WXH_AVG_NEON(16, 8)
+SAD_WXH_AVG_NEON(16, 16)
+SAD_WXH_AVG_NEON(16, 32)
+
+SAD_WXH_AVG_NEON(32, 16)
+SAD_WXH_AVG_NEON(32, 32)
+SAD_WXH_AVG_NEON(32, 64)
+
+SAD_WXH_AVG_NEON(64, 32)
+SAD_WXH_AVG_NEON(64, 64)
+SAD_WXH_AVG_NEON(64, 128)
+
+SAD_WXH_AVG_NEON(128, 64)
+SAD_WXH_AVG_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_AVG_NEON(4, 16)
+SAD_WXH_AVG_NEON(8, 32)
+SAD_WXH_AVG_NEON(16, 4)
+SAD_WXH_AVG_NEON(16, 64)
+SAD_WXH_AVG_NEON(32, 8)
+SAD_WXH_AVG_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_AVG_NEON
+
+static INLINE unsigned int dist_wtd_sad128xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  // We use 8 accumulators to prevent overflow for large values of 'h' and
+  // to enable optimal UADALP instruction throughput on CPUs that have
+  // either 2 or 4 Neon pipes.
+ uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0) };
+
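+  // dist_wtd_avg_u8x16() (from dist_wtd_avg_neon.h) is assumed to form the
+  // distance-weighted average of the two predictions from the fwd/bck
+  // offsets, roughly (fwd_offset * a + bck_offset * b + 8) >> 4, before the
+  // absolute difference against the source is taken.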
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+ uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+ uint8x16_t p2 = vld1q_u8(second_pred + 32);
+ uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+ uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+ uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+ uint8x16_t p3 = vld1q_u8(second_pred + 48);
+ uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+ uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ uint8x16_t s4 = vld1q_u8(src_ptr + 64);
+ uint8x16_t r4 = vld1q_u8(ref_ptr + 64);
+ uint8x16_t p4 = vld1q_u8(second_pred + 64);
+ uint8x16_t wtd_avg4 = dist_wtd_avg_u8x16(p4, r4, bck_offset, fwd_offset);
+ uint8x16_t diff4 = vabdq_u8(s4, wtd_avg4);
+ sum[4] = vpadalq_u8(sum[4], diff4);
+
+ uint8x16_t s5 = vld1q_u8(src_ptr + 80);
+ uint8x16_t r5 = vld1q_u8(ref_ptr + 80);
+ uint8x16_t p5 = vld1q_u8(second_pred + 80);
+ uint8x16_t wtd_avg5 = dist_wtd_avg_u8x16(p5, r5, bck_offset, fwd_offset);
+ uint8x16_t diff5 = vabdq_u8(s5, wtd_avg5);
+ sum[5] = vpadalq_u8(sum[5], diff5);
+
+ uint8x16_t s6 = vld1q_u8(src_ptr + 96);
+ uint8x16_t r6 = vld1q_u8(ref_ptr + 96);
+ uint8x16_t p6 = vld1q_u8(second_pred + 96);
+ uint8x16_t wtd_avg6 = dist_wtd_avg_u8x16(p6, r6, bck_offset, fwd_offset);
+ uint8x16_t diff6 = vabdq_u8(s6, wtd_avg6);
+ sum[6] = vpadalq_u8(sum[6], diff6);
+
+ uint8x16_t s7 = vld1q_u8(src_ptr + 112);
+ uint8x16_t r7 = vld1q_u8(ref_ptr + 112);
+ uint8x16_t p7 = vld1q_u8(second_pred + 112);
+ uint8x16_t wtd_avg7 = dist_wtd_avg_u8x16(p7, r7, bck_offset, fwd_offset);
+ uint8x16_t diff7 = vabdq_u8(s7, wtd_avg7);
+ sum[7] = vpadalq_u8(sum[7], diff7);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 128;
+ } while (--h != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[4]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[5]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[6]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[7]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int dist_wtd_sad64xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+ uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+ uint8x16_t p2 = vld1q_u8(second_pred + 32);
+ uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+ uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+ uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+ uint8x16_t p3 = vld1q_u8(second_pred + 48);
+ uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+ uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
+ } while (--h != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int dist_wtd_sad32xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
+ } while (--h != 0);
+
+ return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
+}
+
+static INLINE unsigned int dist_wtd_sad16xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ uint8x16_t r = vld1q_u8(ref_ptr);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t wtd_avg = dist_wtd_avg_u8x16(p, r, bck_offset, fwd_offset);
+ uint8x16_t diff = vabdq_u8(s, wtd_avg);
+ sum = vpadalq_u8(sum, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--h != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int dist_wtd_sad8xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+ const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t r = vld1_u8(ref_ptr);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t wtd_avg = dist_wtd_avg_u8x8(p, r, bck_offset, fwd_offset);
+ sum = vabal_u8(sum, s, wtd_avg);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 8;
+ } while (--h != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int dist_wtd_sad4xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+ const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h / 2;
+ do {
+ uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t wtd_avg = dist_wtd_avg_u8x8(p, r, bck_offset, fwd_offset);
+ sum = vabal_u8(sum, s, wtd_avg);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ second_pred += 8;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+#define DIST_WTD_SAD_WXH_AVG_NEON(w, h) \
+ unsigned int aom_dist_wtd_sad##w##x##h##_avg_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ return dist_wtd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+ second_pred, jcp_param); \
+ }
+
+DIST_WTD_SAD_WXH_AVG_NEON(4, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(4, 8)
+
+DIST_WTD_SAD_WXH_AVG_NEON(8, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 16)
+
+DIST_WTD_SAD_WXH_AVG_NEON(16, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 32)
+
+DIST_WTD_SAD_WXH_AVG_NEON(32, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 64)
+
+DIST_WTD_SAD_WXH_AVG_NEON(64, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 128)
+
+DIST_WTD_SAD_WXH_AVG_NEON(128, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+DIST_WTD_SAD_WXH_AVG_NEON(4, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef DIST_WTD_SAD_WXH_AVG_NEON
diff --git a/third_party/aom/aom_dsp/arm/sad_neon_dotprod.c b/third_party/aom/aom_dsp/arm/sad_neon_dotprod.c
new file mode 100644
index 0000000000..5504c6838e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sad_neon_dotprod.c
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h) {
+ // Only two accumulators are required for optimal instruction throughput of
+ // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
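+  // vdotq_u32(acc, diff, vdupq_n_u8(1)) dots each group of four absolute
+  // differences with a vector of ones, adding four bytes directly into a
+  // 32-bit lane, so no intermediate 16-bit widening step is needed.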
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr + j);
+ r0 = vld1q_u8(ref_ptr + j);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ s1 = vld1q_u8(src_ptr + j + 16);
+ r1 = vld1q_u8(ref_ptr + j + 16);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ j += 32;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad128xh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
+}
+
+static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
+}
+
+static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
+}
+
+static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
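+  // Two rows are processed per iteration, one into each accumulator, keeping
+  // the two partial sums independent across the loop body.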
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+
+ s1 = vld1q_u8(src_ptr);
+ r1 = vld1q_u8(ref_ptr);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_NEON_DOTPROD(w, h) \
+ unsigned int aom_sad##w##x##h##_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \
+ }
+
+SAD_WXH_NEON_DOTPROD(16, 8)
+SAD_WXH_NEON_DOTPROD(16, 16)
+SAD_WXH_NEON_DOTPROD(16, 32)
+
+SAD_WXH_NEON_DOTPROD(32, 16)
+SAD_WXH_NEON_DOTPROD(32, 32)
+SAD_WXH_NEON_DOTPROD(32, 64)
+
+SAD_WXH_NEON_DOTPROD(64, 32)
+SAD_WXH_NEON_DOTPROD(64, 64)
+SAD_WXH_NEON_DOTPROD(64, 128)
+
+SAD_WXH_NEON_DOTPROD(128, 64)
+SAD_WXH_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_NEON_DOTPROD(16, 4)
+SAD_WXH_NEON_DOTPROD(16, 64)
+SAD_WXH_NEON_DOTPROD(32, 8)
+SAD_WXH_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_NEON_DOTPROD
+
+#define SAD_SKIP_WXH_NEON_DOTPROD(w, h) \
+ unsigned int aom_sad_skip_##w##x##h##_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \
+ 2 * ref_stride, (h) / 2); \
+ }
+
+SAD_SKIP_WXH_NEON_DOTPROD(16, 8)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 32)
+
+SAD_SKIP_WXH_NEON_DOTPROD(32, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 64)
+
+SAD_SKIP_WXH_NEON_DOTPROD(64, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 128)
+
+SAD_SKIP_WXH_NEON_DOTPROD(128, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_NEON_DOTPROD(16, 4)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 8)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_NEON_DOTPROD
+
+static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h,
+ const uint8_t *second_pred) {
+ // Only two accumulators are required for optimal instruction throughput of
+ // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr + j);
+ r0 = vld1q_u8(ref_ptr + j);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ s1 = vld1q_u8(src_ptr + j + 16);
+ r1 = vld1q_u8(ref_ptr + j + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ j += 32;
+ second_pred += 32;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad128xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128,
+ h, second_pred);
+}
+
+static INLINE unsigned int sad64xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ h, second_pred);
+}
+
+static INLINE unsigned int sad32xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32,
+ h, second_pred);
+}
+
+static INLINE unsigned int sad16xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+
+ s1 = vld1q_u8(src_ptr);
+ r1 = vld1q_u8(ref_ptr);
+ p1 = vld1q_u8(second_pred);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_AVG_NEON_DOTPROD(w, h) \
+ unsigned int aom_sad##w##x##h##_avg_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \
+ second_pred); \
+ }
+
+SAD_WXH_AVG_NEON_DOTPROD(16, 8)
+SAD_WXH_AVG_NEON_DOTPROD(16, 16)
+SAD_WXH_AVG_NEON_DOTPROD(16, 32)
+
+SAD_WXH_AVG_NEON_DOTPROD(32, 16)
+SAD_WXH_AVG_NEON_DOTPROD(32, 32)
+SAD_WXH_AVG_NEON_DOTPROD(32, 64)
+
+SAD_WXH_AVG_NEON_DOTPROD(64, 32)
+SAD_WXH_AVG_NEON_DOTPROD(64, 64)
+SAD_WXH_AVG_NEON_DOTPROD(64, 128)
+
+SAD_WXH_AVG_NEON_DOTPROD(128, 64)
+SAD_WXH_AVG_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_AVG_NEON_DOTPROD(16, 4)
+SAD_WXH_AVG_NEON_DOTPROD(16, 64)
+SAD_WXH_AVG_NEON_DOTPROD(32, 8)
+SAD_WXH_AVG_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_AVG_NEON_DOTPROD
+
+static INLINE unsigned int dist_wtd_sad128xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+  // We use 8 accumulators to minimize the accumulation and loop-carried
+  // dependencies, improving instruction throughput.
+ uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+ uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+ uint8x16_t p2 = vld1q_u8(second_pred + 32);
+ uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+ uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+ sum[2] = vdotq_u32(sum[2], diff2, vdupq_n_u8(1));
+
+ uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+ uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+ uint8x16_t p3 = vld1q_u8(second_pred + 48);
+ uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+ uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+ sum[3] = vdotq_u32(sum[3], diff3, vdupq_n_u8(1));
+
+ uint8x16_t s4 = vld1q_u8(src_ptr + 64);
+ uint8x16_t r4 = vld1q_u8(ref_ptr + 64);
+ uint8x16_t p4 = vld1q_u8(second_pred + 64);
+ uint8x16_t wtd_avg4 = dist_wtd_avg_u8x16(p4, r4, bck_offset, fwd_offset);
+ uint8x16_t diff4 = vabdq_u8(s4, wtd_avg4);
+ sum[4] = vdotq_u32(sum[4], diff4, vdupq_n_u8(1));
+
+ uint8x16_t s5 = vld1q_u8(src_ptr + 80);
+ uint8x16_t r5 = vld1q_u8(ref_ptr + 80);
+ uint8x16_t p5 = vld1q_u8(second_pred + 80);
+ uint8x16_t wtd_avg5 = dist_wtd_avg_u8x16(p5, r5, bck_offset, fwd_offset);
+ uint8x16_t diff5 = vabdq_u8(s5, wtd_avg5);
+ sum[5] = vdotq_u32(sum[5], diff5, vdupq_n_u8(1));
+
+ uint8x16_t s6 = vld1q_u8(src_ptr + 96);
+ uint8x16_t r6 = vld1q_u8(ref_ptr + 96);
+ uint8x16_t p6 = vld1q_u8(second_pred + 96);
+ uint8x16_t wtd_avg6 = dist_wtd_avg_u8x16(p6, r6, bck_offset, fwd_offset);
+ uint8x16_t diff6 = vabdq_u8(s6, wtd_avg6);
+ sum[6] = vdotq_u32(sum[6], diff6, vdupq_n_u8(1));
+
+ uint8x16_t s7 = vld1q_u8(src_ptr + 112);
+ uint8x16_t r7 = vld1q_u8(ref_ptr + 112);
+ uint8x16_t p7 = vld1q_u8(second_pred + 112);
+ uint8x16_t wtd_avg7 = dist_wtd_avg_u8x16(p7, r7, bck_offset, fwd_offset);
+ uint8x16_t diff7 = vabdq_u8(s7, wtd_avg7);
+ sum[7] = vdotq_u32(sum[7], diff7, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 128;
+ } while (--h != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[4] = vaddq_u32(sum[4], sum[5]);
+ sum[6] = vaddq_u32(sum[6], sum[7]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+ sum[4] = vaddq_u32(sum[4], sum[6]);
+ sum[0] = vaddq_u32(sum[0], sum[4]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad64xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+ uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+ uint8x16_t p2 = vld1q_u8(second_pred + 32);
+ uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+ uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+ sum[2] = vdotq_u32(sum[2], diff2, vdupq_n_u8(1));
+
+ uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+ uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+ uint8x16_t p3 = vld1q_u8(second_pred + 48);
+ uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+ uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+ sum[3] = vdotq_u32(sum[3], diff3, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
+ } while (--h != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad32xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
+ } while (--h != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad16xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+
+ uint8x16_t s1 = vld1q_u8(src_ptr);
+ uint8x16_t r1 = vld1q_u8(ref_ptr);
+ uint8x16_t p1 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+#define DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(w, h) \
+ unsigned int aom_dist_wtd_sad##w##x##h##_avg_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ return dist_wtd_sad##w##xh_avg_neon_dotprod( \
+ src, src_stride, ref, ref_stride, (h), second_pred, jcp_param); \
+ }
+
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 8)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 16)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 32)
+
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 16)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 32)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 64)
+
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 32)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 64)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 128)
+
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(128, 64)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 4)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 64)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 8)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD
diff --git a/third_party/aom/aom_dsp/arm/sadxd_neon.c b/third_party/aom/aom_dsp/arm/sadxd_neon.c
new file mode 100644
index 0000000000..e89e1c5a73
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sadxd_neon.c
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+ uint16x8_t *const sad_sum) {
+ uint8x16_t abs_diff = vabdq_u8(src, ref);
+ *sad_sum = vpadalq_u8(*sad_sum, abs_diff);
+}
+
+static INLINE void sadwxhx3d_large_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[3],
+ int ref_stride, uint32_t res[3], int w,
+ int h, int h_overflow) {
+ uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ int h_limit = h > h_overflow ? h_overflow : h;
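+  // 'h_overflow' is the number of rows that the uint16 partial sums below can
+  // safely accumulate before being widened into the uint32 totals. As a quick
+  // check: for w == 128 each uint16 lane gathers 8 absolute differences per
+  // row, so 32 rows reach at most 8 * 255 * 32 = 65280, still within 16 bits.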
+
+ int ref_offset = 0;
+ int i = 0;
+ do {
+ uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+ uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ do {
+ int j = 0;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + j);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
+
+ const uint8x16_t s1 = vld1q_u8(src + j + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
+
+ j += 32;
+ } while (j < w);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (++i < h_limit);
+
+ sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
+ sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
+ sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
+ sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
+ sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
+ sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
+
+ h_limit += h_overflow;
+ } while (i < h);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+static INLINE void sad128xhx3d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[3], int ref_stride,
+ uint32_t res[3], int h) {
+ sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32);
+}
+
+static INLINE void sad64xhx3d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[3], int ref_stride,
+ uint32_t res[3], int h) {
+ sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64);
+}
+
+static INLINE void sad32xhx3d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[3], int ref_stride,
+ uint32_t res[3], int h) {
+ uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+ uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]);
+
+ const uint8x16_t s1 = vld1q_u8(src + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ res[0] = horizontal_long_add_u16x8(sum_lo[0], sum_hi[0]);
+ res[1] = horizontal_long_add_u16x8(sum_lo[1], sum_hi[1]);
+ res[2] = horizontal_long_add_u16x8(sum_lo[2], sum_hi[2]);
+}
+
+static INLINE void sad16xhx3d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[3], int ref_stride,
+ uint32_t res[3], int h) {
+ uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ const uint8x16_t s = vld1q_u8(src);
+ sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ res[0] = horizontal_add_u16x8(sum[0]);
+ res[1] = horizontal_add_u16x8(sum[1]);
+ res[2] = horizontal_add_u16x8(sum[2]);
+}
+
+static INLINE void sad8xhx3d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[3], int ref_stride,
+ uint32_t res[3], int h) {
+ uint16x8_t sum[3];
+
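+  // The first row initializes the accumulators directly via vabdl_u8; the
+  // remaining h - 1 rows accumulate with vabal_u8.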
+ uint8x8_t s = vld1_u8(src);
+ sum[0] = vabdl_u8(s, vld1_u8(ref[0]));
+ sum[1] = vabdl_u8(s, vld1_u8(ref[1]));
+ sum[2] = vabdl_u8(s, vld1_u8(ref[2]));
+
+ src += src_stride;
+ int ref_offset = ref_stride;
+ int i = h - 1;
+ do {
+ s = vld1_u8(src);
+ sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset));
+ sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset));
+ sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset));
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ res[0] = horizontal_add_u16x8(sum[0]);
+ res[1] = horizontal_add_u16x8(sum[1]);
+ res[2] = horizontal_add_u16x8(sum[2]);
+}
+
+static INLINE void sad4xhx3d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[3], int ref_stride,
+ uint32_t res[3], int h) {
+ assert(h % 2 == 0);
+ uint16x8_t sum[3];
+
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride);
+ uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride);
+ uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride);
+
+ sum[0] = vabdl_u8(s, r0);
+ sum[1] = vabdl_u8(s, r1);
+ sum[2] = vabdl_u8(s, r2);
+
+ src += 2 * src_stride;
+ int ref_offset = 2 * ref_stride;
+ int i = (h / 2) - 1;
+ do {
+ s = load_unaligned_u8(src, src_stride);
+ r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
+ r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
+ r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
+
+ sum[0] = vabal_u8(sum[0], s, r0);
+ sum[1] = vabal_u8(sum[1], s, r1);
+ sum[2] = vabal_u8(sum[2], s, r2);
+
+ src += 2 * src_stride;
+ ref_offset += 2 * ref_stride;
+ } while (--i != 0);
+
+ res[0] = horizontal_add_u16x8(sum[0]);
+ res[1] = horizontal_add_u16x8(sum[1]);
+ res[2] = horizontal_add_u16x8(sum[2]);
+}
+
+#define SAD_WXH_3D_NEON(w, h) \
+ void aom_sad##w##x##h##x3d_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ sad##w##xhx3d_neon(src, src_stride, ref, ref_stride, res, (h)); \
+ }
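+
+// Note: the x3d wrappers take four-element ref/res arrays (matching the x4d
+// signature below), but only the first three entries are used here.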
+
+SAD_WXH_3D_NEON(4, 4)
+SAD_WXH_3D_NEON(4, 8)
+
+SAD_WXH_3D_NEON(8, 4)
+SAD_WXH_3D_NEON(8, 8)
+SAD_WXH_3D_NEON(8, 16)
+
+SAD_WXH_3D_NEON(16, 8)
+SAD_WXH_3D_NEON(16, 16)
+SAD_WXH_3D_NEON(16, 32)
+
+SAD_WXH_3D_NEON(32, 16)
+SAD_WXH_3D_NEON(32, 32)
+SAD_WXH_3D_NEON(32, 64)
+
+SAD_WXH_3D_NEON(64, 32)
+SAD_WXH_3D_NEON(64, 64)
+SAD_WXH_3D_NEON(64, 128)
+
+SAD_WXH_3D_NEON(128, 64)
+SAD_WXH_3D_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_3D_NEON(4, 16)
+SAD_WXH_3D_NEON(8, 32)
+SAD_WXH_3D_NEON(16, 4)
+SAD_WXH_3D_NEON(16, 64)
+SAD_WXH_3D_NEON(32, 8)
+SAD_WXH_3D_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_3D_NEON
+
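+// sadwxhx4d_large_neon below follows the same blocked accumulation scheme as
+// sadwxhx3d_large_neon above, extended to four reference blocks.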
+static INLINE void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4], int w,
+ int h, int h_overflow) {
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ int h_limit = h > h_overflow ? h_overflow : h;
+
+ int ref_offset = 0;
+ int i = 0;
+ do {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ do {
+ int j = 0;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + j);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]);
+
+ const uint8x16_t s1 = vld1q_u8(src + j + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]);
+
+ j += 32;
+ } while (j < w);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (++i < h_limit);
+
+ sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
+ sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
+ sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
+ sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
+ sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
+ sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
+ sum[3] = vpadalq_u16(sum[3], sum_lo[3]);
+ sum[3] = vpadalq_u16(sum[3], sum_hi[3]);
+
+ h_limit += h_overflow;
+ } while (i < h);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void sad128xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32);
+}
+
+static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64);
+}
+
+static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + ref_offset), &sum_lo[3]);
+
+ const uint8x16_t s1 = vld1q_u8(src + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + 16), &sum_hi[3]);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi));
+}
+
+static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32[4];
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ const uint8x16_t s = vld1q_u8(src);
+ sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum_u16[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum_u16[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum_u16[2]);
+ sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum_u16[3]);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ sum_u32[0] = vpaddlq_u16(sum_u16[0]);
+ sum_u32[1] = vpaddlq_u16(sum_u16[1]);
+ sum_u32[2] = vpaddlq_u16(sum_u16[2]);
+ sum_u32[3] = vpaddlq_u16(sum_u16[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
+}
+
+static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4];
+
+ uint8x8_t s = vld1_u8(src);
+ sum[0] = vabdl_u8(s, vld1_u8(ref[0]));
+ sum[1] = vabdl_u8(s, vld1_u8(ref[1]));
+ sum[2] = vabdl_u8(s, vld1_u8(ref[2]));
+ sum[3] = vabdl_u8(s, vld1_u8(ref[3]));
+
+ src += src_stride;
+ int ref_offset = ref_stride;
+ int i = h - 1;
+ do {
+ s = vld1_u8(src);
+ sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset));
+ sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset));
+ sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset));
+ sum[3] = vabal_u8(sum[3], s, vld1_u8(ref[3] + ref_offset));
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u16x8(sum));
+}
+
+static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4];
+
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride);
+ uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride);
+ uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride);
+ uint8x8_t r3 = load_unaligned_u8(ref[3], ref_stride);
+
+ sum[0] = vabdl_u8(s, r0);
+ sum[1] = vabdl_u8(s, r1);
+ sum[2] = vabdl_u8(s, r2);
+ sum[3] = vabdl_u8(s, r3);
+
+ src += 2 * src_stride;
+ int ref_offset = 2 * ref_stride;
+ int i = h / 2;
+ while (--i != 0) {
+ s = load_unaligned_u8(src, src_stride);
+ r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
+ r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
+ r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
+ r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride);
+
+ sum[0] = vabal_u8(sum[0], s, r0);
+ sum[1] = vabal_u8(sum[1], s, r1);
+ sum[2] = vabal_u8(sum[2], s, r2);
+ sum[3] = vabal_u8(sum[3], s, r3);
+
+ src += 2 * src_stride;
+ ref_offset += 2 * ref_stride;
+ }
+
+ vst1q_u32(res, horizontal_add_4d_u16x8(sum));
+}
+
+#define SAD_WXH_4D_NEON(w, h) \
+ void aom_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \
+ }
+
+SAD_WXH_4D_NEON(4, 4)
+SAD_WXH_4D_NEON(4, 8)
+
+SAD_WXH_4D_NEON(8, 4)
+SAD_WXH_4D_NEON(8, 8)
+SAD_WXH_4D_NEON(8, 16)
+
+SAD_WXH_4D_NEON(16, 8)
+SAD_WXH_4D_NEON(16, 16)
+SAD_WXH_4D_NEON(16, 32)
+
+SAD_WXH_4D_NEON(32, 16)
+SAD_WXH_4D_NEON(32, 32)
+SAD_WXH_4D_NEON(32, 64)
+
+SAD_WXH_4D_NEON(64, 32)
+SAD_WXH_4D_NEON(64, 64)
+SAD_WXH_4D_NEON(64, 128)
+
+SAD_WXH_4D_NEON(128, 64)
+SAD_WXH_4D_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_4D_NEON(4, 16)
+SAD_WXH_4D_NEON(8, 32)
+SAD_WXH_4D_NEON(16, 4)
+SAD_WXH_4D_NEON(16, 64)
+SAD_WXH_4D_NEON(32, 8)
+SAD_WXH_4D_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_4D_NEON
+
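+// The "skip" variants approximate the SAD from every other row: the strides
+// are doubled, the height is halved and the result is scaled back up by two.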
+#define SAD_SKIP_WXH_4D_NEON(w, h) \
+ void aom_sad_skip_##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res, \
+ ((h) >> 1)); \
+ res[0] <<= 1; \
+ res[1] <<= 1; \
+ res[2] <<= 1; \
+ res[3] <<= 1; \
+ }
+
+SAD_SKIP_WXH_4D_NEON(4, 4)
+SAD_SKIP_WXH_4D_NEON(4, 8)
+
+SAD_SKIP_WXH_4D_NEON(8, 4)
+SAD_SKIP_WXH_4D_NEON(8, 8)
+SAD_SKIP_WXH_4D_NEON(8, 16)
+
+SAD_SKIP_WXH_4D_NEON(16, 8)
+SAD_SKIP_WXH_4D_NEON(16, 16)
+SAD_SKIP_WXH_4D_NEON(16, 32)
+
+SAD_SKIP_WXH_4D_NEON(32, 16)
+SAD_SKIP_WXH_4D_NEON(32, 32)
+SAD_SKIP_WXH_4D_NEON(32, 64)
+
+SAD_SKIP_WXH_4D_NEON(64, 32)
+SAD_SKIP_WXH_4D_NEON(64, 64)
+SAD_SKIP_WXH_4D_NEON(64, 128)
+
+SAD_SKIP_WXH_4D_NEON(128, 64)
+SAD_SKIP_WXH_4D_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_4D_NEON(4, 16)
+SAD_SKIP_WXH_4D_NEON(8, 32)
+SAD_SKIP_WXH_4D_NEON(16, 4)
+SAD_SKIP_WXH_4D_NEON(16, 64)
+SAD_SKIP_WXH_4D_NEON(32, 8)
+SAD_SKIP_WXH_4D_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_4D_NEON
diff --git a/third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c b/third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c
new file mode 100644
index 0000000000..3d11d1cb96
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
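+// Sum the absolute differences of a 16-pixel row by taking their dot product
+// with a vector of ones; each 32-bit lane accumulates four byte-wide absolute
+// differences, so no intermediate 16-bit accumulator is needed.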
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+ uint32x4_t *const sad_sum) {
+ uint8x16_t abs_diff = vabdq_u8(src, ref);
+ *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
+}
+
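+// For blocks at least 32 pixels wide, keep separate accumulators for the low
+// and high 16-byte halves of each 32-byte chunk and merge them only at the
+// end.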
+static INLINE void sadwxhx3d_large_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int w, int h) {
+ uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + j);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
+
+ const uint8x16_t s1 = vld1q_u8(src + j + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
+
+ j += 32;
+ } while (j < w);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0]));
+ res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1]));
+ res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2]));
+}
+
+static INLINE void sad128xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 128, h);
+}
+
+static INLINE void sad64xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 64, h);
+}
+
+static INLINE void sad32xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 32, h);
+}
+
+static INLINE void sad16xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ const uint8x16_t s = vld1q_u8(src);
+ sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+#define SAD_WXH_3D_NEON_DOTPROD(w, h) \
+ void aom_sad##w##x##h##x3d_neon_dotprod(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ sad##w##xhx3d_neon_dotprod(src, src_stride, ref, ref_stride, res, (h)); \
+ }
+
+SAD_WXH_3D_NEON_DOTPROD(16, 8)
+SAD_WXH_3D_NEON_DOTPROD(16, 16)
+SAD_WXH_3D_NEON_DOTPROD(16, 32)
+
+SAD_WXH_3D_NEON_DOTPROD(32, 16)
+SAD_WXH_3D_NEON_DOTPROD(32, 32)
+SAD_WXH_3D_NEON_DOTPROD(32, 64)
+
+SAD_WXH_3D_NEON_DOTPROD(64, 32)
+SAD_WXH_3D_NEON_DOTPROD(64, 64)
+SAD_WXH_3D_NEON_DOTPROD(64, 128)
+
+SAD_WXH_3D_NEON_DOTPROD(128, 64)
+SAD_WXH_3D_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_3D_NEON_DOTPROD(16, 4)
+SAD_WXH_3D_NEON_DOTPROD(16, 64)
+SAD_WXH_3D_NEON_DOTPROD(32, 8)
+SAD_WXH_3D_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_3D_NEON_DOTPROD
+
+static INLINE void sadwxhx4d_large_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int w, int h) {
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + j);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]);
+
+ const uint8x16_t s1 = vld1q_u8(src + j + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]);
+
+ j += 32;
+ } while (j < w);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void sad128xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 128, h);
+}
+
+static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 64, h);
+}
+
+static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 32, h);
+}
+
+static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ const uint8x16_t s = vld1q_u8(src);
+ sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
+ sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum[3]);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+#define SAD_WXH_4D_NEON_DOTPROD(w, h) \
+ void aom_sad##w##x##h##x4d_neon_dotprod(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ sad##w##xhx4d_neon_dotprod(src, src_stride, ref, ref_stride, res, (h)); \
+ }
+
+SAD_WXH_4D_NEON_DOTPROD(16, 8)
+SAD_WXH_4D_NEON_DOTPROD(16, 16)
+SAD_WXH_4D_NEON_DOTPROD(16, 32)
+
+SAD_WXH_4D_NEON_DOTPROD(32, 16)
+SAD_WXH_4D_NEON_DOTPROD(32, 32)
+SAD_WXH_4D_NEON_DOTPROD(32, 64)
+
+SAD_WXH_4D_NEON_DOTPROD(64, 32)
+SAD_WXH_4D_NEON_DOTPROD(64, 64)
+SAD_WXH_4D_NEON_DOTPROD(64, 128)
+
+SAD_WXH_4D_NEON_DOTPROD(128, 64)
+SAD_WXH_4D_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_4D_NEON_DOTPROD(16, 4)
+SAD_WXH_4D_NEON_DOTPROD(16, 64)
+SAD_WXH_4D_NEON_DOTPROD(32, 8)
+SAD_WXH_4D_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_4D_NEON_DOTPROD
+
+#define SAD_SKIP_WXH_4D_NEON_DOTPROD(w, h) \
+ void aom_sad_skip_##w##x##h##x4d_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ sad##w##xhx4d_neon_dotprod(src, 2 * src_stride, ref, 2 * ref_stride, res, \
+ ((h) >> 1)); \
+ res[0] <<= 1; \
+ res[1] <<= 1; \
+ res[2] <<= 1; \
+ res[3] <<= 1; \
+ }
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 8)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32)
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64)
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 128)
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(128, 64)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 4)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 64)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 8)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_4D_NEON_DOTPROD
diff --git a/third_party/aom/aom_dsp/arm/sse_neon.c b/third_party/aom/aom_dsp/arm/sse_neon.c
new file mode 100644
index 0000000000..ec8f0ee183
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sse_neon.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
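+// Squared error of a 16-pixel row: square the absolute differences with a
+// widening 8-bit multiply and fold the 16-bit products into 32-bit lanes
+// using a pairwise add-and-accumulate.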
+static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
+ uint32x4_t *sse) {
+ uint8x16_t s = vld1q_u8(src);
+ uint8x16_t r = vld1q_u8(ref);
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ uint8x8_t abs_diff_lo = vget_low_u8(abs_diff);
+ uint8x8_t abs_diff_hi = vget_high_u8(abs_diff);
+
+ *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_lo, abs_diff_lo));
+ *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_hi, abs_diff_hi));
+}
+
+static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref,
+ uint32x4_t *sse) {
+ uint8x8_t s = vld1_u8(src);
+ uint8x8_t r = vld1_u8(ref);
+
+ uint8x8_t abs_diff = vabd_u8(s, r);
+
+ *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff));
+}
+
+static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ uint32x4_t *sse) {
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+
+ uint8x8_t abs_diff = vabd_u8(s, r);
+
+ *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff));
+}
+
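+// Generic fallback for widths without a dedicated kernel. A width whose
+// remainder modulo eight is between one and four is processed two rows at a
+// time, with an 8-wide inner loop plus a trailing 4x2 block; other widths use
+// the plain 8-wide row loop.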
+static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int width, int height) {
+ uint32x4_t sse = vdupq_n_u32(0);
+
+ if ((width & 0x07) && ((width & 0x07) < 5)) {
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ sse_8x1_neon(src + j, ref + j, &sse);
+ sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse);
+ j += 8;
+ } while (j + 4 < width);
+
+ sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+ } else {
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ sse_8x1_neon(src + j, ref + j, &sse);
+ j += 8;
+ } while (j < width);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+ }
+ return horizontal_add_u32x4(sse);
+}
+
+static INLINE uint32_t sse_128xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon(src, ref, &sse[0]);
+ sse_16x1_neon(src + 16, ref + 16, &sse[1]);
+ sse_16x1_neon(src + 32, ref + 32, &sse[0]);
+ sse_16x1_neon(src + 48, ref + 48, &sse[1]);
+ sse_16x1_neon(src + 64, ref + 64, &sse[0]);
+ sse_16x1_neon(src + 80, ref + 80, &sse[1]);
+ sse_16x1_neon(src + 96, ref + 96, &sse[0]);
+ sse_16x1_neon(src + 112, ref + 112, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon(src, ref, &sse[0]);
+ sse_16x1_neon(src + 16, ref + 16, &sse[1]);
+ sse_16x1_neon(src + 32, ref + 32, &sse[0]);
+ sse_16x1_neon(src + 48, ref + 48, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon(src, ref, &sse[0]);
+ sse_16x1_neon(src + 16, ref + 16, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon(src, ref, &sse[0]);
+ src += src_stride;
+ ref += ref_stride;
+ sse_16x1_neon(src, ref, &sse[1]);
+ src += src_stride;
+ ref += ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse = vdupq_n_u32(0);
+
+ int i = height;
+ do {
+ sse_8x1_neon(src, ref, &sse);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(sse);
+}
+
+static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse = vdupq_n_u32(0);
+
+ int i = height;
+ do {
+ sse_4x2_neon(src, src_stride, ref, ref_stride, &sse);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x4(sse);
+}
+
+int64_t aom_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref,
+ int ref_stride, int width, int height) {
+ switch (width) {
+ case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height);
+ case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height);
+ case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height);
+ case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height);
+ case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height);
+ case 128: return sse_128xh_neon(src, src_stride, ref, ref_stride, height);
+ default:
+ return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/sse_neon_dotprod.c b/third_party/aom/aom_dsp/arm/sse_neon_dotprod.c
new file mode 100644
index 0000000000..979049780b
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sse_neon_dotprod.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
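+// With the dot-product extension, the squared error of a 16-pixel row is the
+// dot product of the absolute-difference vector with itself, accumulated
+// straight into 32-bit lanes.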
+static INLINE void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
+ uint32x4_t *sse) {
+ uint8x16_t s = vld1q_u8(src);
+ uint8x16_t r = vld1q_u8(ref);
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+
+ *sse = vdotq_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
+ uint32x2_t *sse) {
+ uint8x8_t s = vld1_u8(src);
+ uint8x8_t r = vld1_u8(ref);
+
+ uint8x8_t abs_diff = vabd_u8(s, r);
+
+ *sse = vdot_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ uint32x2_t *sse) {
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+
+ uint8x8_t abs_diff = vabd_u8(s, r);
+
+ *sse = vdot_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int width, int height) {
+ uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+ if ((width & 0x07) && ((width & 0x07) < 5)) {
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]);
+ sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride,
+ &sse[1]);
+ j += 8;
+ } while (j + 4 < width);
+
+ sse_4x2_neon_dotprod(src + j, src_stride, ref + j, ref_stride, &sse[0]);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+ } else {
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]);
+ sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride,
+ &sse[1]);
+ j += 8;
+ } while (j < width);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+ }
+ return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_128xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon_dotprod(src, ref, &sse[0]);
+ sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
+ sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]);
+ sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]);
+ sse_16x1_neon_dotprod(src + 64, ref + 64, &sse[0]);
+ sse_16x1_neon_dotprod(src + 80, ref + 80, &sse[1]);
+ sse_16x1_neon_dotprod(src + 96, ref + 96, &sse[0]);
+ sse_16x1_neon_dotprod(src + 112, ref + 112, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon_dotprod(src, ref, &sse[0]);
+ sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
+ sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]);
+ sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon_dotprod(src, ref, &sse[0]);
+ sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon_dotprod(src, ref, &sse[0]);
+ src += src_stride;
+ ref += ref_stride;
+ sse_16x1_neon_dotprod(src, ref, &sse[1]);
+ src += src_stride;
+ ref += ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_8x1_neon_dotprod(src, ref, &sse[0]);
+ src += src_stride;
+ ref += ref_stride;
+ sse_8x1_neon_dotprod(src, ref, &sse[1]);
+ src += src_stride;
+ ref += ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x2_t sse = vdup_n_u32(0);
+
+ int i = height;
+ do {
+ sse_4x2_neon_dotprod(src, src_stride, ref, ref_stride, &sse);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x2(sse);
+}
+
+int64_t aom_sse_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int width,
+ int height) {
+ switch (width) {
+ case 4:
+ return sse_4xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ case 8:
+ return sse_8xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ case 16:
+ return sse_16xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ case 32:
+ return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ case 64:
+ return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ case 128:
+ return sse_128xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ default:
+ return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width,
+ height);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
new file mode 100644
index 0000000000..2e6e738853
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
@@ -0,0 +1,1103 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/variance.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+
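+// First-pass/second-pass bilinear filters for the sub-pixel variance kernels.
+// Each output pixel is (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3,
+// where s1 lies pixel_step bytes after s0 (1 for horizontal filtering, the
+// source stride for vertical filtering).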
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ vst1_u8(dst_ptr, blend_u8);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ i -= 2;
+ } while (i != 0);
+}
+
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ vst1_u8(dst_ptr, blend_u8);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ } while (--i != 0);
+}
+
+static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
+ blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
+ uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
+ blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
+ uint8x16_t blend_u8 =
+ vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+ vst1q_u8(dst_ptr + j, blend_u8);
+
+ j += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
+ dst_height, filter_offset);
+}
+
+static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
+ dst_height, filter_offset);
+}
+
+static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
+ dst_height, filter_offset);
+}
+
+static void var_filter_block2d_bil_w128(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 128,
+ dst_height, filter_offset);
+}
+
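+// For half-pixel offsets the bilinear blend reduces to a rounding average of
+// two neighbouring pixels, which vrhaddq_u8 computes directly.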
+static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_width, int dst_height) {
+  // We only specialise on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
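+// Sub-pixel variance: filter horizontally into a temporary buffer of
+// (h + padding) rows, filter that buffer vertically into a w x h block, then
+// take the variance of the result against the reference.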
+#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ }
+
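+// Specialised version for larger blocks: an offset of 0 needs no filtering in
+// that direction and an offset of 4 (half-pel) reduces to a two-pixel
+// average, so the general bilinear filter is only used for other offsets.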
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \
+ yoffset); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+
+SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
+
+// Realtime mode doesn't use 4:1 or 1:4 rectangular block sizes.
+#if !CONFIG_REALTIME_ONLY
+
+SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
+
+SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
+
+SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SUBPEL_VARIANCE_WXH_NEON
+#undef SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON
+
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 4.
+static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+ uint8x8_t p = vld1_u8(second_pred);
+ uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ second_pred += 2 * 4;
+ i -= 2;
+ } while (i != 0);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 4.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w4(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+ const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint8x8_t p = vld1_u8(second_pred);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ uint8x8_t avg = dist_wtd_avg_u8x8(blend_u8, p, fwd_offset, bck_offset);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ second_pred += 2 * 4;
+ i -= 2;
+ } while (i != 0);
+}
+
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 8.
+static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+ uint8x8_t p = vld1_u8(second_pred);
+ uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ second_pred += 8;
+ } while (--i > 0);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 8.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w8(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+ const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint8x8_t p = vld1_u8(second_pred);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ uint8x8_t avg = dist_wtd_avg_u8x8(blend_u8, p, fwd_offset, bck_offset);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ second_pred += 8;
+ } while (--i > 0);
+}
+
+// Combine bilinear filter with aom_comp_avg_pred for large blocks.
+static void avg_pred_var_filter_block2d_bil_large(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
+ blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
+ uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
+ blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
+ uint8x16_t blend_u8 =
+ vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ uint8x16_t avg = vrhaddq_u8(blend_u8, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for large blocks.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
+ blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
+ uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
+ blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
+ uint8x16_t blend_u8 =
+ vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ uint8x16_t avg = dist_wtd_avg_u8x16(blend_u8, p, fwd_offset, bck_offset);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 16.
+static void avg_pred_var_filter_block2d_bil_w16(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 16, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 32.
+static void avg_pred_var_filter_block2d_bil_w32(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 32, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 64.
+static void avg_pred_var_filter_block2d_bil_w64(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 64, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 128.
+static void avg_pred_var_filter_block2d_bil_w128(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 128, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 16.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w16(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 32.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w32(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 64.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w64(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 128.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w128(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine averaging subpel filter with aom_comp_avg_pred.
+static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height,
+ const uint8_t *second_pred) {
+  // We only specialise on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ avg = vrhaddq_u8(avg, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Combine averaging subpel filter with aom_dist_wtd_comp_avg_pred.
+static void dist_wtd_avg_pred_var_filter_block2d_avg(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialise on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t p = vld1q_u8(second_pred);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+ avg = dist_wtd_avg_u8x16(avg, p, fwd_offset, bck_offset);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Implementation of aom_comp_avg_pred for blocks having width >= 16.
+static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
+ int dst_width, int dst_height,
+ const uint8_t *second_pred) {
+  // We only specialise on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr + j);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t avg = vrhaddq_u8(s, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Implementation of aom_dist_wtd_comp_avg_pred for blocks having width >= 16.
+static void dist_wtd_avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int dst_width, int dst_height,
+ const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialise on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr + j);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t avg = dist_wtd_avg_u8x16(s, p, fwd_offset, bck_offset);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
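+// Sub-pixel average variance: as the plain sub-pixel variance above, except
+// that the final filtering pass also averages the filtered block with
+// second_pred before the variance is computed.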
+#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
+ xoffset); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ }
+
+#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ if (xoffset == 0) { \
+ uint8_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ avg_pred(src, tmp, source_stride, w, h, second_pred); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \
+ source_stride, w, h, second_pred); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp, source_stride, source_stride, h, yoffset, second_pred); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \
+ second_pred); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \
+ xoffset, second_pred); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1)
+
+#if !CONFIG_REALTIME_ONLY
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SUBPEL_AVG_VARIANCE_WXH_NEON
+#undef SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON
+
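+// Distance-weighted compound variants: the average with second_pred uses the
+// fwd_offset/bck_offset weights from DIST_WTD_COMP_PARAMS instead of a simple
+// rounding average.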
+#define DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
+ xoffset); \
+ dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ }
+
+#define SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ if (xoffset == 0) { \
+ uint8_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ dist_wtd_avg_pred(src, tmp, source_stride, w, h, second_pred, \
+ jcp_param); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ dist_wtd_avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \
+ source_stride, w, h, \
+ second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp, source_stride, source_stride, h, yoffset, second_pred, \
+ jcp_param); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ dist_wtd_avg_pred_var_filter_block2d_avg( \
+ src, tmp0, source_stride, 1, w, h, second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, \
+ second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp0, source_stride, 1, h, xoffset, second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, \
+ second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1)
+
+#if !CONFIG_REALTIME_ONLY
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON
+#undef SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON
+
+#if !CONFIG_REALTIME_ONLY
+
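+// OBMC sub-pixel variance: the same two-pass bilinear filtering, with the
+// variance taken against the overlapped-block weighted source (wsrc) and
+// mask.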
+#define OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_obmc_variance##w##x##h(tmp1, w, wsrc, mask, sse); \
+ }
+
+#define SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_obmc_variance##w##x##h##_neon(pre, pre_stride, wsrc, mask, \
+ sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_avg(pre, tmp, pre_stride, pre_stride, w, h); \
+ return aom_obmc_variance##w##x##h##_neon(tmp, w, wsrc, mask, sse); \
+ } else { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_bil_w##w(pre, tmp, pre_stride, pre_stride, h, \
+ yoffset); \
+ return aom_obmc_variance##w##x##h##_neon(tmp, w, wsrc, mask, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h); \
+ return aom_obmc_variance##w##x##h##_neon(tmp0, w, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h + padding); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h + padding); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h, xoffset); \
+ return aom_obmc_variance##w##x##h##_neon(tmp0, w, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \
+ xoffset); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
+ } \
+ } \
+ }
+
+OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
+
+OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
+
+OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
+
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
+
+#undef OBMC_SUBPEL_VARIANCE_WXH_NEON
+#undef SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON
+#endif // !CONFIG_REALTIME_ONLY
+
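+// General case: filter horizontally then vertically with the bilinear
+// kernel, blend the result with the second prediction using the mask, then
+// compute the variance against the reference block.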
+#define MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ uint8_t tmp2[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, msk_stride, \
+ invert_mask); \
+ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
+ }
+
+#define SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ if (xoffset == 0) { \
+ uint8_t tmp0[w * h]; \
+ if (yoffset == 0) { \
+ aom_comp_mask_pred_neon(tmp0, second_pred, w, h, src, src_stride, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, src_stride, w, h); \
+ aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, src_stride, h, \
+ yoffset); \
+ aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \
+ aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ uint8_t tmp2[w * h]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ uint8_t tmp2[w * h]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ if (yoffset == 0) { \
+ uint8_t tmp0[w * h]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \
+ aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ uint8_t tmp2[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * (h + padding)]; \
+ uint8_t tmp2[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+
+MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+
+MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
+
+// Realtime mode doesn't use the 4:1/1:4 rectangular block sizes.
+#if !CONFIG_REALTIME_ONLY
+MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef MASKED_SUBPEL_VARIANCE_WXH_NEON
+#undef SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON
diff --git a/third_party/aom/aom_dsp/arm/subtract_neon.c b/third_party/aom/aom_dsp/arm/subtract_neon.c
new file mode 100644
index 0000000000..a195c40d19
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/subtract_neon.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src,
+ ptrdiff_t src_stride, const uint8_t *pred,
+ ptrdiff_t pred_stride) {
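+  // Block dimensions are powers of two, so each branch below processes an
+  // exact multiple of its vector width per row.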
+ if (cols > 16) {
+ int r = rows;
+ do {
+ int c = 0;
+ do {
+ const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
+ const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
+ const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
+ const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
+ const uint16x8_t v_diff_lo_00 =
+ vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00));
+ const uint16x8_t v_diff_hi_00 =
+ vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00));
+ const uint16x8_t v_diff_lo_16 =
+ vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16));
+ const uint16x8_t v_diff_hi_16 =
+ vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16));
+ vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
+ vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
+ vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
+ vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
+ c += 32;
+ } while (c < cols);
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r != 0);
+ } else if (cols > 8) {
+ int r = rows;
+ do {
+ const uint8x16_t v_src = vld1q_u8(&src[0]);
+ const uint8x16_t v_pred = vld1q_u8(&pred[0]);
+ const uint16x8_t v_diff_lo =
+ vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred));
+ const uint16x8_t v_diff_hi =
+ vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred));
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
+ vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r != 0);
+ } else if (cols > 4) {
+ int r = rows;
+ do {
+ const uint8x8_t v_src = vld1_u8(&src[0]);
+ const uint8x8_t v_pred = vld1_u8(&pred[0]);
+ const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r != 0);
+ } else {
+ int r = rows;
+ do {
+ int c = 0;
+ do {
+ diff[c] = src[c] - pred[c];
+ } while (++c < cols);
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r != 0);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_subtract_block_neon(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+
+ if (cols > 16) {
+ int r = rows;
+ do {
+ int c = 0;
+ do {
+ const uint16x8_t v_src_00 = vld1q_u16(&src[c + 0]);
+ const uint16x8_t v_pred_00 = vld1q_u16(&pred[c + 0]);
+ const uint16x8_t v_diff_00 = vsubq_u16(v_src_00, v_pred_00);
+ const uint16x8_t v_src_08 = vld1q_u16(&src[c + 8]);
+ const uint16x8_t v_pred_08 = vld1q_u16(&pred[c + 8]);
+ const uint16x8_t v_diff_08 = vsubq_u16(v_src_08, v_pred_08);
+ vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_00));
+ vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_08));
+ c += 16;
+ } while (c < cols);
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r != 0);
+ } else if (cols > 8) {
+ int r = rows;
+ do {
+ const uint16x8_t v_src_00 = vld1q_u16(&src[0]);
+ const uint16x8_t v_pred_00 = vld1q_u16(&pred[0]);
+ const uint16x8_t v_diff_00 = vsubq_u16(v_src_00, v_pred_00);
+ const uint16x8_t v_src_08 = vld1q_u16(&src[8]);
+ const uint16x8_t v_pred_08 = vld1q_u16(&pred[8]);
+ const uint16x8_t v_diff_08 = vsubq_u16(v_src_08, v_pred_08);
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_00));
+ vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_08));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r != 0);
+ } else if (cols > 4) {
+ int r = rows;
+ do {
+ const uint16x8_t v_src_r0 = vld1q_u16(&src[0]);
+ const uint16x8_t v_src_r1 = vld1q_u16(&src[src_stride]);
+ const uint16x8_t v_pred_r0 = vld1q_u16(&pred[0]);
+ const uint16x8_t v_pred_r1 = vld1q_u16(&pred[pred_stride]);
+ const uint16x8_t v_diff_r0 = vsubq_u16(v_src_r0, v_pred_r0);
+ const uint16x8_t v_diff_r1 = vsubq_u16(v_src_r1, v_pred_r1);
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_r0));
+ vst1q_s16(&diff[diff_stride], vreinterpretq_s16_u16(v_diff_r1));
+ diff += diff_stride << 1;
+ pred += pred_stride << 1;
+ src += src_stride << 1;
+ r -= 2;
+ } while (r != 0);
+ } else {
+ int r = rows;
+ do {
+ const uint16x4_t v_src_r0 = vld1_u16(&src[0]);
+ const uint16x4_t v_src_r1 = vld1_u16(&src[src_stride]);
+ const uint16x4_t v_pred_r0 = vld1_u16(&pred[0]);
+ const uint16x4_t v_pred_r1 = vld1_u16(&pred[pred_stride]);
+ const uint16x4_t v_diff_r0 = vsub_u16(v_src_r0, v_pred_r0);
+ const uint16x4_t v_diff_r1 = vsub_u16(v_src_r1, v_pred_r1);
+ vst1_s16(&diff[0], vreinterpret_s16_u16(v_diff_r0));
+ vst1_s16(&diff[diff_stride], vreinterpret_s16_u16(v_diff_r1));
+ diff += diff_stride << 1;
+ pred += pred_stride << 1;
+ src += src_stride << 1;
+ r -= 2;
+ } while (r != 0);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/arm/sum_neon.h b/third_party/aom/aom_dsp/arm/sum_neon.h
new file mode 100644
index 0000000000..30a108e70a
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sum_neon.h
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_SUM_NEON_H_
+#define AOM_AOM_DSP_ARM_SUM_NEON_H_
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+static INLINE int horizontal_add_u8x8(const uint8x8_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddlv_u8(a);
+#else
+ uint16x4_t b = vpaddl_u8(a);
+ uint32x2_t c = vpaddl_u16(b);
+ return vget_lane_u32(c, 0) + vget_lane_u32(c, 1);
+#endif
+}
+
+static INLINE int horizontal_add_s16x8(const int16x8_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddlvq_s16(a);
+#else
+ const int32x4_t b = vpaddlq_s16(a);
+ const int64x2_t c = vpaddlq_s32(b);
+ const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
+ vreinterpret_s32_s64(vget_high_s64(c)));
+ return vget_lane_s32(d, 0);
+#endif
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddvq_s32(a);
+#else
+ const int64x2_t b = vpaddlq_s32(a);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+#endif
+}
+
+static INLINE int64_t horizontal_add_s64x2(const int64x2_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddvq_s64(a);
+#else
+ return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
+#endif
+}
+
+static INLINE uint64_t horizontal_add_u64x2(const uint64x2_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddvq_u64(a);
+#else
+ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
+#endif
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4(const uint32x4_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddlvq_u32(a);
+#else
+ const uint64x2_t b = vpaddlq_u32(a);
+ return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1);
+#endif
+}
+
+static INLINE int64_t horizontal_long_add_s32x4(const int32x4_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddlvq_s32(a);
+#else
+ const int64x2_t b = vpaddlq_s32(a);
+ return vgetq_lane_s64(b, 0) + vgetq_lane_s64(b, 1);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_u32x4(const uint32x4_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddvq_u32(a);
+#else
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+#endif
+}
+
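+// Return a vector in which lane i holds the horizontal sum of sum[i].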
+static INLINE uint32x4_t horizontal_add_4d_u32x4(const uint32x4_t sum[4]) {
+#if AOM_ARCH_AARCH64
+ uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
+ uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
+ return vpaddq_u32(res01, res23);
+#else
+ uint32x4_t res = vdupq_n_u32(0);
+ res = vsetq_lane_u32(horizontal_add_u32x4(sum[0]), res, 0);
+ res = vsetq_lane_u32(horizontal_add_u32x4(sum[1]), res, 1);
+ res = vsetq_lane_u32(horizontal_add_u32x4(sum[2]), res, 2);
+ res = vsetq_lane_u32(horizontal_add_u32x4(sum[3]), res, 3);
+ return res;
+#endif
+}
+
+static INLINE int32x4_t horizontal_add_4d_s32x4(const int32x4_t sum[4]) {
+#if AOM_ARCH_AARCH64
+ int32x4_t res01 = vpaddq_s32(sum[0], sum[1]);
+ int32x4_t res23 = vpaddq_s32(sum[2], sum[3]);
+ return vpaddq_s32(res01, res23);
+#else
+ int32x4_t res = vdupq_n_s32(0);
+ res = vsetq_lane_s32(horizontal_add_s32x4(sum[0]), res, 0);
+ res = vsetq_lane_s32(horizontal_add_s32x4(sum[1]), res, 1);
+ res = vsetq_lane_s32(horizontal_add_s32x4(sum[2]), res, 2);
+ res = vsetq_lane_s32(horizontal_add_s32x4(sum[3]), res, 3);
+ return res;
+#endif
+}
+
+static INLINE uint32_t horizontal_long_add_u16x8(const uint16x8_t vec_lo,
+ const uint16x8_t vec_hi) {
+#if AOM_ARCH_AARCH64
+ return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
+#else
+ const uint32x4_t vec_l_lo =
+ vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+ const uint32x4_t vec_l_hi =
+ vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+ const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+#endif
+}
+
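+// Return a vector in which lane i holds the widening horizontal sum of
+// sum_lo[i] and sum_hi[i].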
+static INLINE uint32x4_t horizontal_long_add_4d_u16x8(
+ const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) {
+ const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]);
+ const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]);
+ const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]);
+ const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]);
+ const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]);
+ const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]);
+ const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]);
+ const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]);
+#if AOM_ARCH_AARCH64
+ const uint32x4_t c0 = vpaddq_u32(b0, b1);
+ const uint32x4_t c1 = vpaddq_u32(b2, b3);
+ return vpaddq_u32(c0, c1);
+#else
+ const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
+ const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
+ const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
+ const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
+ const uint32x2_t d0 = vpadd_u32(c0, c1);
+ const uint32x2_t d1 = vpadd_u32(c2, c3);
+ return vcombine_u32(d0, d1);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_u16x8(const uint16x8_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddlvq_u16(a);
+#else
+ const uint32x4_t b = vpaddlq_u16(a);
+ const uint64x2_t c = vpaddlq_u32(b);
+ const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+ vreinterpret_u32_u64(vget_high_u64(c)));
+ return vget_lane_u32(d, 0);
+#endif
+}
+
+static INLINE uint32x4_t horizontal_add_4d_u16x8(const uint16x8_t sum[4]) {
+#if AOM_ARCH_AARCH64
+ const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
+ const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
+ const uint16x8_t b0 = vpaddq_u16(a0, a1);
+ return vpaddlq_u16(b0);
+#else
+ const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
+ const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
+ const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
+ const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
+ const uint16x4_t b0 = vpadd_u16(a0, a1);
+ const uint16x4_t b1 = vpadd_u16(a2, a3);
+ return vpaddlq_u16(vcombine_u16(b0, b1));
+#endif
+}
+
+static INLINE int32x4_t horizontal_add_4d_s16x8(const int16x8_t sum[4]) {
+#if AOM_ARCH_AARCH64
+ const int16x8_t a0 = vpaddq_s16(sum[0], sum[1]);
+ const int16x8_t a1 = vpaddq_s16(sum[2], sum[3]);
+ const int16x8_t b0 = vpaddq_s16(a0, a1);
+ return vpaddlq_s16(b0);
+#else
+ const int16x4_t a0 = vadd_s16(vget_low_s16(sum[0]), vget_high_s16(sum[0]));
+ const int16x4_t a1 = vadd_s16(vget_low_s16(sum[1]), vget_high_s16(sum[1]));
+ const int16x4_t a2 = vadd_s16(vget_low_s16(sum[2]), vget_high_s16(sum[2]));
+ const int16x4_t a3 = vadd_s16(vget_low_s16(sum[3]), vget_high_s16(sum[3]));
+ const int16x4_t b0 = vpadd_s16(a0, a1);
+ const int16x4_t b1 = vpadd_s16(a2, a3);
+ return vpaddlq_s16(vcombine_s16(b0, b1));
+#endif
+}
+
+static INLINE uint32_t horizontal_add_u32x2(const uint32x2_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddv_u32(a);
+#else
+ const uint64x1_t b = vpaddl_u32(a);
+ return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+#endif
+}
+
+static INLINE uint64_t horizontal_long_add_u32x2(const uint32x2_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddlv_u32(a);
+#else
+ const uint64x1_t b = vpaddl_u32(a);
+ return vget_lane_u64(b, 0);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_u16x4(const uint16x4_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddlv_u16(a);
+#else
+ const uint32x2_t b = vpaddl_u16(a);
+ const uint64x1_t c = vpaddl_u32(b);
+ return vget_lane_u32(vreinterpret_u32_u64(c), 0);
+#endif
+}
+
+static INLINE int32x4_t horizontal_add_2d_s32(int32x4_t a, int32x4_t b) {
+#if AOM_ARCH_AARCH64
+ return vpaddq_s32(a, b);
+#else
+ const int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+ const int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
+ return vcombine_s32(a0, b0);
+#endif
+}
+
+static INLINE int32x2_t add_pairwise_s32x4(int32x4_t a) {
+#if AOM_ARCH_AARCH64
+ return vget_low_s32(vpaddq_s32(a, a));
+#else
+ return vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+#endif
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4_x2(const uint32x4_t a[2]) {
+ return horizontal_long_add_u32x4(a[0]) + horizontal_long_add_u32x4(a[1]);
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4_x4(const uint32x4_t a[4]) {
+ uint64x2_t sum = vpaddlq_u32(a[0]);
+ sum = vpadalq_u32(sum, a[1]);
+ sum = vpadalq_u32(sum, a[2]);
+ sum = vpadalq_u32(sum, a[3]);
+
+ return horizontal_add_u64x2(sum);
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4_x8(const uint32x4_t a[8]) {
+ uint64x2_t sum[2];
+ sum[0] = vpaddlq_u32(a[0]);
+ sum[1] = vpaddlq_u32(a[1]);
+ sum[0] = vpadalq_u32(sum[0], a[2]);
+ sum[1] = vpadalq_u32(sum[1], a[3]);
+ sum[0] = vpadalq_u32(sum[0], a[4]);
+ sum[1] = vpadalq_u32(sum[1], a[5]);
+ sum[0] = vpadalq_u32(sum[0], a[6]);
+ sum[1] = vpadalq_u32(sum[1], a[7]);
+
+ return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1]));
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4_x16(const uint32x4_t a[16]) {
+ uint64x2_t sum[2];
+ sum[0] = vpaddlq_u32(a[0]);
+ sum[1] = vpaddlq_u32(a[1]);
+ sum[0] = vpadalq_u32(sum[0], a[2]);
+ sum[1] = vpadalq_u32(sum[1], a[3]);
+ sum[0] = vpadalq_u32(sum[0], a[4]);
+ sum[1] = vpadalq_u32(sum[1], a[5]);
+ sum[0] = vpadalq_u32(sum[0], a[6]);
+ sum[1] = vpadalq_u32(sum[1], a[7]);
+ sum[0] = vpadalq_u32(sum[0], a[8]);
+ sum[1] = vpadalq_u32(sum[1], a[9]);
+ sum[0] = vpadalq_u32(sum[0], a[10]);
+ sum[1] = vpadalq_u32(sum[1], a[11]);
+ sum[0] = vpadalq_u32(sum[0], a[12]);
+ sum[1] = vpadalq_u32(sum[1], a[13]);
+ sum[0] = vpadalq_u32(sum[0], a[14]);
+ sum[1] = vpadalq_u32(sum[1], a[15]);
+
+ return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1]));
+}
+
+#endif // AOM_AOM_DSP_ARM_SUM_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/sum_squares_neon.c b/third_party/aom/aom_dsp/arm/sum_squares_neon.c
new file mode 100644
index 0000000000..424b2b4445
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sum_squares_neon.c
@@ -0,0 +1,574 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE uint64_t aom_sum_squares_2d_i16_4x4_neon(const int16_t *src,
+ int stride) {
+ int16x4_t s0 = vld1_s16(src + 0 * stride);
+ int16x4_t s1 = vld1_s16(src + 1 * stride);
+ int16x4_t s2 = vld1_s16(src + 2 * stride);
+ int16x4_t s3 = vld1_s16(src + 3 * stride);
+
+ int32x4_t sum_squares = vmull_s16(s0, s0);
+ sum_squares = vmlal_s16(sum_squares, s1, s1);
+ sum_squares = vmlal_s16(sum_squares, s2, s2);
+ sum_squares = vmlal_s16(sum_squares, s3, s3);
+
+ return horizontal_long_add_u32x4(vreinterpretq_u32_s32(sum_squares));
+}
+
+static INLINE uint64_t aom_sum_squares_2d_i16_4xn_neon(const int16_t *src,
+ int stride, int height) {
+ int32x4_t sum_squares[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ int h = height;
+ do {
+ int16x4_t s0 = vld1_s16(src + 0 * stride);
+ int16x4_t s1 = vld1_s16(src + 1 * stride);
+ int16x4_t s2 = vld1_s16(src + 2 * stride);
+ int16x4_t s3 = vld1_s16(src + 3 * stride);
+
+ sum_squares[0] = vmlal_s16(sum_squares[0], s0, s0);
+ sum_squares[0] = vmlal_s16(sum_squares[0], s1, s1);
+ sum_squares[1] = vmlal_s16(sum_squares[1], s2, s2);
+ sum_squares[1] = vmlal_s16(sum_squares[1], s3, s3);
+
+ src += 4 * stride;
+ h -= 4;
+ } while (h != 0);
+
+ return horizontal_long_add_u32x4(
+ vreinterpretq_u32_s32(vaddq_s32(sum_squares[0], sum_squares[1])));
+}
+
+static INLINE uint64_t aom_sum_squares_2d_i16_nxn_neon(const int16_t *src,
+ int stride, int width,
+ int height) {
+ uint64x2_t sum_squares = vdupq_n_u64(0);
+
+ int h = height;
+ do {
+ int32x4_t ss_row[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int w = 0;
+ do {
+ const int16_t *s = src + w;
+ int16x8_t s0 = vld1q_s16(s + 0 * stride);
+ int16x8_t s1 = vld1q_s16(s + 1 * stride);
+ int16x8_t s2 = vld1q_s16(s + 2 * stride);
+ int16x8_t s3 = vld1q_s16(s + 3 * stride);
+
+ ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s0), vget_low_s16(s0));
+ ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s1), vget_low_s16(s1));
+ ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s2), vget_low_s16(s2));
+ ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s3), vget_low_s16(s3));
+ ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s0), vget_high_s16(s0));
+ ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s1), vget_high_s16(s1));
+ ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s2), vget_high_s16(s2));
+ ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s3), vget_high_s16(s3));
+ w += 8;
+ } while (w < width);
+
+ sum_squares = vpadalq_u32(
+ sum_squares, vreinterpretq_u32_s32(vaddq_s32(ss_row[0], ss_row[1])));
+
+ src += 4 * stride;
+ h -= 4;
+ } while (h != 0);
+
+ return horizontal_add_u64x2(sum_squares);
+}
+
+uint64_t aom_sum_squares_2d_i16_neon(const int16_t *src, int stride, int width,
+ int height) {
+  // 4 elements per row only requires half a SIMD register, so this must be
+  // handled as a special case; note also that over 75% of all calls are with
+  // size == 4, so it is the common case as well.
+ if (LIKELY(width == 4 && height == 4)) {
+ return aom_sum_squares_2d_i16_4x4_neon(src, stride);
+ } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+ return aom_sum_squares_2d_i16_4xn_neon(src, stride, height);
+ } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
+ // Generic case
+ return aom_sum_squares_2d_i16_nxn_neon(src, stride, width, height);
+ } else {
+ return aom_sum_squares_2d_i16_c(src, stride, width, height);
+ }
+}
+
+static INLINE uint64_t aom_sum_sse_2d_i16_4x4_neon(const int16_t *src,
+ int stride, int *sum) {
+ int16x4_t s0 = vld1_s16(src + 0 * stride);
+ int16x4_t s1 = vld1_s16(src + 1 * stride);
+ int16x4_t s2 = vld1_s16(src + 2 * stride);
+ int16x4_t s3 = vld1_s16(src + 3 * stride);
+
+ int32x4_t sse = vmull_s16(s0, s0);
+ sse = vmlal_s16(sse, s1, s1);
+ sse = vmlal_s16(sse, s2, s2);
+ sse = vmlal_s16(sse, s3, s3);
+
+ int32x4_t sum_01 = vaddl_s16(s0, s1);
+ int32x4_t sum_23 = vaddl_s16(s2, s3);
+ *sum += horizontal_add_s32x4(vaddq_s32(sum_01, sum_23));
+
+ return horizontal_long_add_u32x4(vreinterpretq_u32_s32(sse));
+}
+
+static INLINE uint64_t aom_sum_sse_2d_i16_4xn_neon(const int16_t *src,
+ int stride, int height,
+ int *sum) {
+ int32x4_t sse[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int32x2_t sum_acc[2] = { vdup_n_s32(0), vdup_n_s32(0) };
+
+ int h = height;
+ do {
+ int16x4_t s0 = vld1_s16(src + 0 * stride);
+ int16x4_t s1 = vld1_s16(src + 1 * stride);
+ int16x4_t s2 = vld1_s16(src + 2 * stride);
+ int16x4_t s3 = vld1_s16(src + 3 * stride);
+
+ sse[0] = vmlal_s16(sse[0], s0, s0);
+ sse[0] = vmlal_s16(sse[0], s1, s1);
+ sse[1] = vmlal_s16(sse[1], s2, s2);
+ sse[1] = vmlal_s16(sse[1], s3, s3);
+
+ sum_acc[0] = vpadal_s16(sum_acc[0], s0);
+ sum_acc[0] = vpadal_s16(sum_acc[0], s1);
+ sum_acc[1] = vpadal_s16(sum_acc[1], s2);
+ sum_acc[1] = vpadal_s16(sum_acc[1], s3);
+
+ src += 4 * stride;
+ h -= 4;
+ } while (h != 0);
+
+ *sum += horizontal_add_s32x4(vcombine_s32(sum_acc[0], sum_acc[1]));
+ return horizontal_long_add_u32x4(
+ vreinterpretq_u32_s32(vaddq_s32(sse[0], sse[1])));
+}
+
+static INLINE uint64_t aom_sum_sse_2d_i16_nxn_neon(const int16_t *src,
+ int stride, int width,
+ int height, int *sum) {
+ uint64x2_t sse = vdupq_n_u64(0);
+ int32x4_t sum_acc = vdupq_n_s32(0);
+
+ int h = height;
+ do {
+ int32x4_t sse_row[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int w = 0;
+ do {
+ const int16_t *s = src + w;
+ int16x8_t s0 = vld1q_s16(s + 0 * stride);
+ int16x8_t s1 = vld1q_s16(s + 1 * stride);
+ int16x8_t s2 = vld1q_s16(s + 2 * stride);
+ int16x8_t s3 = vld1q_s16(s + 3 * stride);
+
+ sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s0), vget_low_s16(s0));
+ sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s1), vget_low_s16(s1));
+ sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s2), vget_low_s16(s2));
+ sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s3), vget_low_s16(s3));
+ sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s0), vget_high_s16(s0));
+ sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s1), vget_high_s16(s1));
+ sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s2), vget_high_s16(s2));
+ sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s3), vget_high_s16(s3));
+
+ sum_acc = vpadalq_s16(sum_acc, s0);
+ sum_acc = vpadalq_s16(sum_acc, s1);
+ sum_acc = vpadalq_s16(sum_acc, s2);
+ sum_acc = vpadalq_s16(sum_acc, s3);
+
+ w += 8;
+ } while (w < width);
+
+ sse = vpadalq_u32(sse,
+ vreinterpretq_u32_s32(vaddq_s32(sse_row[0], sse_row[1])));
+
+ src += 4 * stride;
+ h -= 4;
+ } while (h != 0);
+
+ *sum += horizontal_add_s32x4(sum_acc);
+ return horizontal_add_u64x2(sse);
+}
+
+uint64_t aom_sum_sse_2d_i16_neon(const int16_t *src, int stride, int width,
+ int height, int *sum) {
+ uint64_t sse;
+
+ if (LIKELY(width == 4 && height == 4)) {
+ sse = aom_sum_sse_2d_i16_4x4_neon(src, stride, sum);
+ } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+ // width = 4, height is a multiple of 4.
+ sse = aom_sum_sse_2d_i16_4xn_neon(src, stride, height, sum);
+ } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
+    // Generic case: width is a multiple of 8, height is a multiple of 4.
+ sse = aom_sum_sse_2d_i16_nxn_neon(src, stride, width, height, sum);
+ } else {
+ sse = aom_sum_sse_2d_i16_c(src, stride, width, height, sum);
+ }
+
+ return sse;
+}
+
+static INLINE uint64_t aom_sum_squares_i16_4xn_neon(const int16_t *src,
+ uint32_t n) {
+ uint64x2_t sum_u64 = vdupq_n_u64(0);
+
+ int i = n;
+ do {
+ uint32x4_t sum;
+ int16x4_t s0 = vld1_s16(src);
+
+ sum = vreinterpretq_u32_s32(vmull_s16(s0, s0));
+
+ sum_u64 = vpadalq_u32(sum_u64, sum);
+
+ src += 4;
+ i -= 4;
+ } while (i >= 4);
+
+ if (i > 0) {
+ return horizontal_add_u64x2(sum_u64) + aom_sum_squares_i16_c(src, i);
+ }
+ return horizontal_add_u64x2(sum_u64);
+}
+
+static INLINE uint64_t aom_sum_squares_i16_8xn_neon(const int16_t *src,
+ uint32_t n) {
+ uint64x2_t sum_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ int i = n;
+ do {
+ uint32x4_t sum[2];
+ int16x8_t s0 = vld1q_s16(src);
+
+ sum[0] =
+ vreinterpretq_u32_s32(vmull_s16(vget_low_s16(s0), vget_low_s16(s0)));
+ sum[1] =
+ vreinterpretq_u32_s32(vmull_s16(vget_high_s16(s0), vget_high_s16(s0)));
+
+ sum_u64[0] = vpadalq_u32(sum_u64[0], sum[0]);
+ sum_u64[1] = vpadalq_u32(sum_u64[1], sum[1]);
+
+ src += 8;
+ i -= 8;
+ } while (i >= 8);
+
+ if (i > 0) {
+ return horizontal_add_u64x2(vaddq_u64(sum_u64[0], sum_u64[1])) +
+ aom_sum_squares_i16_c(src, i);
+ }
+ return horizontal_add_u64x2(vaddq_u64(sum_u64[0], sum_u64[1]));
+}
+
+uint64_t aom_sum_squares_i16_neon(const int16_t *src, uint32_t n) {
+ // This function seems to be called only for values of N >= 64. See
+ // av1/encoder/compound_type.c.
+ if (LIKELY(n >= 8)) {
+ return aom_sum_squares_i16_8xn_neon(src, n);
+ }
+ if (n >= 4) {
+ return aom_sum_squares_i16_4xn_neon(src, n);
+ }
+ return aom_sum_squares_i16_c(src, n);
+}
+
+static INLINE uint64_t aom_var_2d_u8_4xh_neon(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x2_t sum_u32 = vdup_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  // 255 * 256 = 65280, so we can accumulate up to 256 8-bit elements in a
+  // 16-bit element before we need to accumulate to 32-bit elements. Since
+  // we're accumulating in uint16x4_t vectors, this means we can accumulate
+  // up to 4 rows of 256 elements. Therefore the limit can be computed as:
+  // h_limit = (4 * 256) / width.
+ int h_limit = (4 * 256) / width;
+ int h_tmp = height > h_limit ? h_limit : height;
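+  // For example, with width == 4 this gives h_limit = (4 * 256) / 4 = 256,
+  // so sum_u16 is widened into sum_u32 after at most 256 rows.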
+
+ int h = 0;
+ do {
+ uint16x4_t sum_u16 = vdup_n_u16(0);
+ do {
+ uint8_t *src_ptr = src;
+ int w = width;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+
+ sum_u16 = vpadal_u8(sum_u16, s0);
+
+ uint16x8_t sse_u16 = vmull_u8(s0, s0);
+
+ sse_u32 = vpadalq_u16(sse_u32, sse_u16);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w >= 8);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += 2 * src_stride;
+ h += 2;
+ } while (h < h_tmp && h < height);
+
+ sum_u32 = vpadal_u16(sum_u32, sum_u16);
+ h_tmp += h_limit;
+ } while (h < height);
+
+ sum += horizontal_long_add_u32x2(sum_u32);
+ sse += horizontal_long_add_u32x4(sse_u32);
+
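+  // sse - sum * sum / N (N = width * height) is N times the population
+  // variance, up to the integer truncation of sum * sum / N.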
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u8_8xh_neon(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x2_t sum_u32 = vdup_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  // 255 * 256 = 65280, so we can accumulate up to 256 8-bit elements in a
+  // 16-bit element before we need to accumulate to 32-bit elements. Since
+  // we're accumulating in uint16x4_t vectors, this means we can accumulate
+  // up to 4 rows of 256 elements. Therefore the limit can be computed as:
+  // h_limit = (4 * 256) / width.
+ int h_limit = (4 * 256) / width;
+ int h_tmp = height > h_limit ? h_limit : height;
+
+ int h = 0;
+ do {
+ uint16x4_t sum_u16 = vdup_n_u16(0);
+ do {
+ uint8_t *src_ptr = src;
+ int w = width;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+
+ sum_u16 = vpadal_u8(sum_u16, s0);
+
+ uint16x8_t sse_u16 = vmull_u8(s0, s0);
+
+ sse_u32 = vpadalq_u16(sse_u32, sse_u16);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w >= 8);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += src_stride;
+ ++h;
+ } while (h < h_tmp && h < height);
+
+ sum_u32 = vpadal_u16(sum_u32, sum_u16);
+ h_tmp += h_limit;
+ } while (h < height);
+
+ sum += horizontal_long_add_u32x2(sum_u32);
+ sse += horizontal_long_add_u32x4(sse_u32);
+
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u8_16xh_neon(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  // 255 * 256 = 65280, so we can accumulate up to 256 8-bit elements in a
+  // 16-bit element before we need to accumulate to 32-bit elements. Since
+  // we're accumulating in uint16x8_t vectors, this means we can accumulate
+  // up to 8 rows of 256 elements. Therefore the limit can be computed as:
+  // h_limit = (8 * 256) / width.
+ int h_limit = (8 * 256) / width;
+ int h_tmp = height > h_limit ? h_limit : height;
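+  // For example, with width == 16 this gives h_limit = (8 * 256) / 16 = 128,
+  // so sum_u16 is widened into sum_u32 after at most 128 rows.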
+
+ int h = 0;
+ do {
+ uint16x8_t sum_u16 = vdupq_n_u16(0);
+ do {
+ int w = width;
+ uint8_t *src_ptr = src;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+
+ sum_u16 = vpadalq_u8(sum_u16, s0);
+
+ uint16x8_t sse_u16_lo = vmull_u8(vget_low_u8(s0), vget_low_u8(s0));
+ uint16x8_t sse_u16_hi = vmull_u8(vget_high_u8(s0), vget_high_u8(s0));
+
+ sse_u32[0] = vpadalq_u16(sse_u32[0], sse_u16_lo);
+ sse_u32[1] = vpadalq_u16(sse_u32[1], sse_u16_hi);
+
+ src_ptr += 16;
+ w -= 16;
+ } while (w >= 16);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += src_stride;
+ ++h;
+ } while (h < h_tmp && h < height);
+
+ sum_u32 = vpadalq_u16(sum_u32, sum_u16);
+ h_tmp += h_limit;
+ } while (h < height);
+
+ sum += horizontal_long_add_u32x4(sum_u32);
+ sse += horizontal_long_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+
+ return sse - sum * sum / (width * height);
+}
+
+uint64_t aom_var_2d_u8_neon(uint8_t *src, int src_stride, int width,
+ int height) {
+ if (width >= 16) {
+ return aom_var_2d_u8_16xh_neon(src, src_stride, width, height);
+ }
+ if (width >= 8) {
+ return aom_var_2d_u8_8xh_neon(src, src_stride, width, height);
+ }
+ if (width >= 4 && height % 2 == 0) {
+ return aom_var_2d_u8_4xh_neon(src, src_stride, width, height);
+ }
+ return aom_var_2d_u8_c(src, src_stride, width, height);
+}
+
+static INLINE uint64_t aom_var_2d_u16_4xh_neon(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x2_t sum_u32 = vdup_n_u32(0);
+ uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+ int h = height;
+ do {
+ int w = width;
+ uint16_t *src_ptr = src_u16;
+ do {
+ uint16x4_t s0 = vld1_u16(src_ptr);
+
+ sum_u32 = vpadal_u16(sum_u32, s0);
+
+ uint32x4_t sse_u32 = vmull_u16(s0, s0);
+
+ sse_u64 = vpadalq_u32(sse_u64, sse_u32);
+
+ src_ptr += 4;
+ w -= 4;
+ } while (w >= 4);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint16_t v = src_u16[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src_u16 += src_stride;
+ } while (--h != 0);
+
+ sum += horizontal_long_add_u32x2(sum_u32);
+ sse += horizontal_add_u64x2(sse_u64);
+
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u16_8xh_neon(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+ uint64x2_t sse_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ int h = height;
+ do {
+ int w = width;
+ uint16_t *src_ptr = src_u16;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+
+ sum_u32 = vpadalq_u16(sum_u32, s0);
+
+ uint32x4_t sse_u32_lo = vmull_u16(vget_low_u16(s0), vget_low_u16(s0));
+ uint32x4_t sse_u32_hi = vmull_u16(vget_high_u16(s0), vget_high_u16(s0));
+
+ sse_u64[0] = vpadalq_u32(sse_u64[0], sse_u32_lo);
+ sse_u64[1] = vpadalq_u32(sse_u64[1], sse_u32_hi);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w >= 8);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint16_t v = src_u16[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src_u16 += src_stride;
+ } while (--h != 0);
+
+ sum += horizontal_long_add_u32x4(sum_u32);
+ sse += horizontal_add_u64x2(vaddq_u64(sse_u64[0], sse_u64[1]));
+
+ return sse - sum * sum / (width * height);
+}
+
+uint64_t aom_var_2d_u16_neon(uint8_t *src, int src_stride, int width,
+ int height) {
+ if (width >= 8) {
+ return aom_var_2d_u16_8xh_neon(src, src_stride, width, height);
+ }
+ if (width >= 4) {
+ return aom_var_2d_u16_4xh_neon(src, src_stride, width, height);
+ }
+ return aom_var_2d_u16_c(src, src_stride, width, height);
+}
diff --git a/third_party/aom/aom_dsp/arm/sum_squares_neon_dotprod.c b/third_party/aom/aom_dsp/arm/sum_squares_neon_dotprod.c
new file mode 100644
index 0000000000..44462a693c
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sum_squares_neon_dotprod.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE uint64_t aom_var_2d_u8_4xh_neon_dotprod(uint8_t *src,
+ int src_stride, int width,
+ int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x2_t sum_u32 = vdup_n_u32(0);
+ uint32x2_t sse_u32 = vdup_n_u32(0);
+
+ int h = height / 2;
+ do {
+ int w = width;
+ uint8_t *src_ptr = src;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+
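+      // Sum the u8 elements via a dot product with a vector of ones; the sum
+      // of squares is the dot product of s0 with itself.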
+ sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1));
+
+ sse_u32 = vdot_u32(sse_u32, s0, s0);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w >= 8);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += 2 * src_stride;
+ } while (--h != 0);
+
+ sum += horizontal_long_add_u32x2(sum_u32);
+ sse += horizontal_long_add_u32x2(sse_u32);
+
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u8_8xh_neon_dotprod(uint8_t *src,
+ int src_stride, int width,
+ int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x2_t sum_u32 = vdup_n_u32(0);
+ uint32x2_t sse_u32 = vdup_n_u32(0);
+
+ int h = height;
+ do {
+ int w = width;
+ uint8_t *src_ptr = src;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+
+ sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1));
+
+ sse_u32 = vdot_u32(sse_u32, s0, s0);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w >= 8);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += src_stride;
+ } while (--h != 0);
+
+ sum += horizontal_long_add_u32x2(sum_u32);
+ sse += horizontal_long_add_u32x2(sse_u32);
+
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u8_16xh_neon_dotprod(uint8_t *src,
+ int src_stride,
+ int width, int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int h = height;
+ do {
+ int w = width;
+ uint8_t *src_ptr = src;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+
+ sum_u32 = vdotq_u32(sum_u32, s0, vdupq_n_u8(1));
+
+ sse_u32 = vdotq_u32(sse_u32, s0, s0);
+
+ src_ptr += 16;
+ w -= 16;
+ } while (w >= 16);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += src_stride;
+ } while (--h != 0);
+
+ sum += horizontal_long_add_u32x4(sum_u32);
+ sse += horizontal_long_add_u32x4(sse_u32);
+
+ return sse - sum * sum / (width * height);
+}
+
+uint64_t aom_var_2d_u8_neon_dotprod(uint8_t *src, int src_stride, int width,
+ int height) {
+ if (width >= 16) {
+ return aom_var_2d_u8_16xh_neon_dotprod(src, src_stride, width, height);
+ }
+ if (width >= 8) {
+ return aom_var_2d_u8_8xh_neon_dotprod(src, src_stride, width, height);
+ }
+ if (width >= 4 && height % 2 == 0) {
+ return aom_var_2d_u8_4xh_neon_dotprod(src, src_stride, width, height);
+ }
+ return aom_var_2d_u8_c(src, src_stride, width, height);
+}
diff --git a/third_party/aom/aom_dsp/arm/sum_squares_sve.c b/third_party/aom/aom_dsp/arm/sum_squares_sve.c
new file mode 100644
index 0000000000..724e43859e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sum_squares_sve.c
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE uint64_t aom_sum_squares_2d_i16_4xh_sve(const int16_t *src,
+ int stride, int height) {
+ int64x2_t sum_squares = vdupq_n_s64(0);
+
+ do {
+ int16x8_t s = vcombine_s16(vld1_s16(src), vld1_s16(src + stride));
+
+ sum_squares = aom_sdotq_s16(sum_squares, s, s);
+
+ src += 2 * stride;
+ height -= 2;
+ } while (height != 0);
+
+ return (uint64_t)vaddvq_s64(sum_squares);
+}
+
+static INLINE uint64_t aom_sum_squares_2d_i16_8xh_sve(const int16_t *src,
+ int stride, int height) {
+ int64x2_t sum_squares[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ int16x8_t s0 = vld1q_s16(src + 0 * stride);
+ int16x8_t s1 = vld1q_s16(src + 1 * stride);
+
+ sum_squares[0] = aom_sdotq_s16(sum_squares[0], s0, s0);
+ sum_squares[1] = aom_sdotq_s16(sum_squares[1], s1, s1);
+
+ src += 2 * stride;
+ height -= 2;
+ } while (height != 0);
+
+ sum_squares[0] = vaddq_s64(sum_squares[0], sum_squares[1]);
+ return (uint64_t)vaddvq_s64(sum_squares[0]);
+}
+
+static INLINE uint64_t aom_sum_squares_2d_i16_large_sve(const int16_t *src,
+ int stride, int width,
+ int height) {
+ int64x2_t sum_squares[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ const int16_t *src_ptr = src;
+ int w = width;
+ do {
+ int16x8_t s0 = vld1q_s16(src_ptr);
+ int16x8_t s1 = vld1q_s16(src_ptr + 8);
+
+ sum_squares[0] = aom_sdotq_s16(sum_squares[0], s0, s0);
+ sum_squares[1] = aom_sdotq_s16(sum_squares[1], s1, s1);
+
+ src_ptr += 16;
+ w -= 16;
+ } while (w != 0);
+
+ src += stride;
+ } while (--height != 0);
+
+ sum_squares[0] = vaddq_s64(sum_squares[0], sum_squares[1]);
+ return (uint64_t)vaddvq_s64(sum_squares[0]);
+}
+
+static INLINE uint64_t aom_sum_squares_2d_i16_wxh_sve(const int16_t *src,
+ int stride, int width,
+ int height) {
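+  // Use an SVE governing predicate (svwhilelt) so that arbitrary widths can
+  // be handled without a separate scalar tail loop.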
+ svint64_t sum_squares = svdup_n_s64(0);
+ uint64_t step = svcnth();
+
+ do {
+ const int16_t *src_ptr = src;
+ int w = 0;
+ do {
+ svbool_t pred = svwhilelt_b16_u32(w, width);
+ svint16_t s0 = svld1_s16(pred, src_ptr);
+
+ sum_squares = svdot_s64(sum_squares, s0, s0);
+
+ src_ptr += step;
+ w += step;
+ } while (w < width);
+
+ src += stride;
+ } while (--height != 0);
+
+ return (uint64_t)svaddv_s64(svptrue_b64(), sum_squares);
+}
+
+uint64_t aom_sum_squares_2d_i16_sve(const int16_t *src, int stride, int width,
+ int height) {
+ if (width == 4) {
+ return aom_sum_squares_2d_i16_4xh_sve(src, stride, height);
+ }
+ if (width == 8) {
+ return aom_sum_squares_2d_i16_8xh_sve(src, stride, height);
+ }
+ if (width % 16 == 0) {
+ return aom_sum_squares_2d_i16_large_sve(src, stride, width, height);
+ }
+ return aom_sum_squares_2d_i16_wxh_sve(src, stride, width, height);
+}
+
+uint64_t aom_sum_squares_i16_sve(const int16_t *src, uint32_t n) {
+ // This function seems to be called only for values of N >= 64. See
+  // av1/encoder/compound_type.c. Additionally, because N = width x height
+  // for width and height drawn from the standard block sizes, N will also
+  // be a multiple of 64.
+ if (LIKELY(n % 64 == 0)) {
+ int64x2_t sum[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+ vdupq_n_s64(0) };
+
+ do {
+ int16x8_t s0 = vld1q_s16(src);
+ int16x8_t s1 = vld1q_s16(src + 8);
+ int16x8_t s2 = vld1q_s16(src + 16);
+ int16x8_t s3 = vld1q_s16(src + 24);
+
+ sum[0] = aom_sdotq_s16(sum[0], s0, s0);
+ sum[1] = aom_sdotq_s16(sum[1], s1, s1);
+ sum[2] = aom_sdotq_s16(sum[2], s2, s2);
+ sum[3] = aom_sdotq_s16(sum[3], s3, s3);
+
+ src += 32;
+ n -= 32;
+ } while (n != 0);
+
+ sum[0] = vaddq_s64(sum[0], sum[1]);
+ sum[2] = vaddq_s64(sum[2], sum[3]);
+ sum[0] = vaddq_s64(sum[0], sum[2]);
+ return vaddvq_s64(sum[0]);
+ }
+ return aom_sum_squares_i16_c(src, n);
+}
+
+static INLINE uint64_t aom_sum_sse_2d_i16_4xh_sve(const int16_t *src,
+ int stride, int height,
+ int *sum) {
+ int64x2_t sse = vdupq_n_s64(0);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+
+ do {
+ int16x8_t s = vcombine_s16(vld1_s16(src), vld1_s16(src + stride));
+
+ sse = aom_sdotq_s16(sse, s, s);
+
+ sum_s32 = vpadalq_s16(sum_s32, s);
+
+ src += 2 * stride;
+ height -= 2;
+ } while (height != 0);
+
+ *sum += vaddvq_s32(sum_s32);
+ return vaddvq_s64(sse);
+}
+
+static INLINE uint64_t aom_sum_sse_2d_i16_8xh_sve(const int16_t *src,
+ int stride, int height,
+ int *sum) {
+ int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+ int32x4_t sum_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ do {
+ int16x8_t s0 = vld1q_s16(src);
+ int16x8_t s1 = vld1q_s16(src + stride);
+
+ sse[0] = aom_sdotq_s16(sse[0], s0, s0);
+ sse[1] = aom_sdotq_s16(sse[1], s1, s1);
+
+ sum_acc[0] = vpadalq_s16(sum_acc[0], s0);
+ sum_acc[1] = vpadalq_s16(sum_acc[1], s1);
+
+ src += 2 * stride;
+ height -= 2;
+ } while (height != 0);
+
+ *sum += vaddvq_s32(vaddq_s32(sum_acc[0], sum_acc[1]));
+ return vaddvq_s64(vaddq_s64(sse[0], sse[1]));
+}
+
+static INLINE uint64_t aom_sum_sse_2d_i16_16xh_sve(const int16_t *src,
+ int stride, int width,
+ int height, int *sum) {
+ int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+ int32x4_t sum_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ do {
+ int w = 0;
+ do {
+ int16x8_t s0 = vld1q_s16(src + w);
+ int16x8_t s1 = vld1q_s16(src + w + 8);
+
+ sse[0] = aom_sdotq_s16(sse[0], s0, s0);
+ sse[1] = aom_sdotq_s16(sse[1], s1, s1);
+
+ sum_acc[0] = vpadalq_s16(sum_acc[0], s0);
+ sum_acc[1] = vpadalq_s16(sum_acc[1], s1);
+
+ w += 16;
+ } while (w < width);
+
+ src += stride;
+ } while (--height != 0);
+
+ *sum += vaddvq_s32(vaddq_s32(sum_acc[0], sum_acc[1]));
+ return vaddvq_s64(vaddq_s64(sse[0], sse[1]));
+}
+
+uint64_t aom_sum_sse_2d_i16_sve(const int16_t *src, int stride, int width,
+ int height, int *sum) {
+ uint64_t sse;
+
+ if (width == 4) {
+ sse = aom_sum_sse_2d_i16_4xh_sve(src, stride, height, sum);
+ } else if (width == 8) {
+ sse = aom_sum_sse_2d_i16_8xh_sve(src, stride, height, sum);
+ } else if (width % 16 == 0) {
+ sse = aom_sum_sse_2d_i16_16xh_sve(src, stride, width, height, sum);
+ } else {
+ sse = aom_sum_sse_2d_i16_c(src, stride, width, height, sum);
+ }
+
+ return sse;
+}
+
+static INLINE uint64_t aom_var_2d_u16_4xh_sve(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+ uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+ int h = height;
+ do {
+ uint16x8_t s0 =
+ vcombine_u16(vld1_u16(src_u16), vld1_u16(src_u16 + src_stride));
+
+ sum_u32 = vpadalq_u16(sum_u32, s0);
+
+ sse_u64 = aom_udotq_u16(sse_u64, s0, s0);
+
+ src_u16 += 2 * src_stride;
+ h -= 2;
+ } while (h != 0);
+
+ sum += vaddlvq_u32(sum_u32);
+ sse += vaddvq_u64(sse_u64);
+
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u16_8xh_sve(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+ uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+ int h = height;
+ do {
+ int w = width;
+ uint16_t *src_ptr = src_u16;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+
+ sum_u32 = vpadalq_u16(sum_u32, s0);
+
+ sse_u64 = aom_udotq_u16(sse_u64, s0, s0);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src_u16 += src_stride;
+ } while (--h != 0);
+
+ sum += vaddlvq_u32(sum_u32);
+ sse += vaddvq_u64(sse_u64);
+
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u16_16xh_sve(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint64x2_t sse_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ int h = height;
+ do {
+ int w = width;
+ uint16_t *src_ptr = src_u16;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+
+ sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
+ sum_u32[1] = vpadalq_u16(sum_u32[1], s1);
+
+ sse_u64[0] = aom_udotq_u16(sse_u64[0], s0, s0);
+ sse_u64[1] = aom_udotq_u16(sse_u64[1], s1, s1);
+
+ src_ptr += 16;
+ w -= 16;
+ } while (w != 0);
+
+ src_u16 += src_stride;
+ } while (--h != 0);
+
+ sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[1]);
+ sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[1]);
+
+ sum += vaddlvq_u32(sum_u32[0]);
+ sse += vaddvq_u64(sse_u64[0]);
+
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u16_large_sve(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint64x2_t sse_u64[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0),
+ vdupq_n_u64(0) };
+
+ int h = height;
+ do {
+ int w = width;
+ uint16_t *src_ptr = src_u16;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+ uint16x8_t s2 = vld1q_u16(src_ptr + 16);
+ uint16x8_t s3 = vld1q_u16(src_ptr + 24);
+
+ sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
+ sum_u32[1] = vpadalq_u16(sum_u32[1], s1);
+ sum_u32[2] = vpadalq_u16(sum_u32[2], s2);
+ sum_u32[3] = vpadalq_u16(sum_u32[3], s3);
+
+ sse_u64[0] = aom_udotq_u16(sse_u64[0], s0, s0);
+ sse_u64[1] = aom_udotq_u16(sse_u64[1], s1, s1);
+ sse_u64[2] = aom_udotq_u16(sse_u64[2], s2, s2);
+ sse_u64[3] = aom_udotq_u16(sse_u64[3], s3, s3);
+
+ src_ptr += 32;
+ w -= 32;
+ } while (w != 0);
+
+ src_u16 += src_stride;
+ } while (--h != 0);
+
+ sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[1]);
+ sum_u32[2] = vaddq_u32(sum_u32[2], sum_u32[3]);
+ sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[2]);
+ sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[1]);
+ sse_u64[2] = vaddq_u64(sse_u64[2], sse_u64[3]);
+ sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[2]);
+
+ sum += vaddlvq_u32(sum_u32[0]);
+ sse += vaddvq_u64(sse_u64[0]);
+
+ return sse - sum * sum / (width * height);
+}
+
+uint64_t aom_var_2d_u16_sve(uint8_t *src, int src_stride, int width,
+ int height) {
+ if (width == 4) {
+ return aom_var_2d_u16_4xh_sve(src, src_stride, width, height);
+ }
+ if (width == 8) {
+ return aom_var_2d_u16_8xh_sve(src, src_stride, width, height);
+ }
+ if (width == 16) {
+ return aom_var_2d_u16_16xh_sve(src, src_stride, width, height);
+ }
+ if (width % 32 == 0) {
+ return aom_var_2d_u16_large_sve(src, src_stride, width, height);
+ }
+ return aom_var_2d_u16_neon(src, src_stride, width, height);
+}
diff --git a/third_party/aom/aom_dsp/arm/transpose_neon.h b/third_party/aom/aom_dsp/arm/transpose_neon.h
new file mode 100644
index 0000000000..8027018235
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/transpose_neon.h
@@ -0,0 +1,1263 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
+#define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h" // For AOM_FORCE_INLINE.
+#include "config/aom_config.h"
+
+static INLINE void transpose_elems_u8_8x8(
+ uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4,
+ uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1,
+ uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
+ uint8x8_t *o7) {
+ // Swap 8 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
+ // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
+ // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
+ // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
+
+ const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(a0, a4), vcombine_u8(a1, a5));
+ const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(a2, a6), vcombine_u8(a3, a7));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
+ // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
+ // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
+ // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
+
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+
+ // Unzip 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ *o0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
+ *o1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
+ *o2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+ *o3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
+ *o4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+ *o5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ *o6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
+ *o7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
+}
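+
+// The shuffles above amount to an ordinary 8x8 transpose; an equivalent scalar
+// sketch (illustrative only; in[r] stands for a0..a7 and out[r] for *o0..*o7,
+// each viewed as an array of 8 bytes):
+//
+//   for (int r = 0; r < 8; r++)
+//     for (int c = 0; c < 8; c++)
+//       out[r][c] = in[c][r];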
+
+static INLINE void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1,
+ uint8x8_t *a2, uint8x8_t *a3,
+ uint8x8_t *a4, uint8x8_t *a5,
+ uint8x8_t *a6,
+ uint8x8_t *a7) {
+ transpose_elems_u8_8x8(*a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7, a0, a1, a2, a3,
+ a4, a5, a6, a7);
+}
+
+static INLINE void transpose_arrays_u8_8x8(const uint8x8_t *in,
+ uint8x8_t *out) {
+ transpose_elems_u8_8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
+ &out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+}
+
+static AOM_FORCE_INLINE void transpose_arrays_u8_8x16(const uint8x8_t *x,
+ uint8x16_t *d) {
+ uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
+ uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
+ uint8x8x2_t w2 = vzip_u8(x[4], x[5]);
+ uint8x8x2_t w3 = vzip_u8(x[6], x[7]);
+
+ uint8x8x2_t w8 = vzip_u8(x[8], x[9]);
+ uint8x8x2_t w9 = vzip_u8(x[10], x[11]);
+ uint8x8x2_t w10 = vzip_u8(x[12], x[13]);
+ uint8x8x2_t w11 = vzip_u8(x[14], x[15]);
+
+ uint16x4x2_t w4 =
+ vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
+ uint16x4x2_t w5 =
+ vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
+ uint16x4x2_t w12 =
+ vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0]));
+ uint16x4x2_t w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]),
+ vreinterpret_u16_u8(w11.val[0]));
+
+ uint32x2x2_t w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
+ vreinterpret_u32_u16(w5.val[0]));
+ uint32x2x2_t w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
+ vreinterpret_u32_u16(w5.val[1]));
+ uint32x2x2_t w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
+ vreinterpret_u32_u16(w13.val[0]));
+ uint32x2x2_t w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
+ vreinterpret_u32_u16(w13.val[1]));
+
+ // Store first 4-line result
+ d[0] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
+ d[1] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
+ d[2] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
+ d[3] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));
+
+ w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
+ w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
+ w12 =
+ vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1]));
+ w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]),
+ vreinterpret_u16_u8(w11.val[1]));
+
+ w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
+ vreinterpret_u32_u16(w5.val[0]));
+ w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
+ vreinterpret_u32_u16(w5.val[1]));
+ w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
+ vreinterpret_u32_u16(w13.val[0]));
+ w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
+ vreinterpret_u32_u16(w13.val[1]));
+
+ // Store second 4-line result
+ d[4] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
+ d[5] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
+ d[6] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
+ d[7] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));
+}
+
+static AOM_FORCE_INLINE void transpose_arrays_u8_16x8(const uint8x16_t *x,
+ uint8x8_t *d) {
+ uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
+ uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
+ uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
+ uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);
+
+ uint16x8x2_t w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+ vreinterpretq_u16_u8(w1.val[0]));
+ uint16x8x2_t w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
+ vreinterpretq_u16_u8(w3.val[0]));
+ uint16x8x2_t w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+ vreinterpretq_u16_u8(w1.val[1]));
+ uint16x8x2_t w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
+ vreinterpretq_u16_u8(w3.val[1]));
+
+ uint32x4x2_t w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
+ vreinterpretq_u32_u16(w5.val[0]));
+ uint32x4x2_t w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]),
+ vreinterpretq_u32_u16(w7.val[0]));
+ uint32x4x2_t w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
+ vreinterpretq_u32_u16(w5.val[1]));
+ uint32x4x2_t w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
+ vreinterpretq_u32_u16(w7.val[1]));
+
+ d[0] = vreinterpret_u8_u32(vget_low_u32(w8.val[0]));
+ d[1] = vreinterpret_u8_u32(vget_high_u32(w8.val[0]));
+ d[2] = vreinterpret_u8_u32(vget_low_u32(w8.val[1]));
+ d[3] = vreinterpret_u8_u32(vget_high_u32(w8.val[1]));
+ d[4] = vreinterpret_u8_u32(vget_low_u32(w10.val[0]));
+ d[5] = vreinterpret_u8_u32(vget_high_u32(w10.val[0]));
+ d[6] = vreinterpret_u8_u32(vget_low_u32(w10.val[1]));
+ d[7] = vreinterpret_u8_u32(vget_high_u32(w10.val[1]));
+ d[8] = vreinterpret_u8_u32(vget_low_u32(w9.val[0]));
+ d[9] = vreinterpret_u8_u32(vget_high_u32(w9.val[0]));
+ d[10] = vreinterpret_u8_u32(vget_low_u32(w9.val[1]));
+ d[11] = vreinterpret_u8_u32(vget_high_u32(w9.val[1]));
+ d[12] = vreinterpret_u8_u32(vget_low_u32(w11.val[0]));
+ d[13] = vreinterpret_u8_u32(vget_high_u32(w11.val[0]));
+ d[14] = vreinterpret_u8_u32(vget_low_u32(w11.val[1]));
+ d[15] = vreinterpret_u8_u32(vget_high_u32(w11.val[1]));
+}
+
+static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
+ uint16x8x2_t b0;
+#if AOM_ARCH_AARCH64
+ b0.val[0] = vreinterpretq_u16_u64(
+ vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+ b0.val[1] = vreinterpretq_u16_u64(
+ vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+#else
+ b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
+ vreinterpret_u16_u32(vget_low_u32(a1)));
+ b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
+ vreinterpret_u16_u32(vget_high_u32(a1)));
+#endif
+ return b0;
+}
+
+static INLINE void transpose_arrays_u8_16x16(const uint8x16_t *x,
+ uint8x16_t *d) {
+ uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
+ uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
+ uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
+ uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);
+
+ uint8x16x2_t w4 = vzipq_u8(x[8], x[9]);
+ uint8x16x2_t w5 = vzipq_u8(x[10], x[11]);
+ uint8x16x2_t w6 = vzipq_u8(x[12], x[13]);
+ uint8x16x2_t w7 = vzipq_u8(x[14], x[15]);
+
+ uint16x8x2_t w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+ vreinterpretq_u16_u8(w1.val[0]));
+ uint16x8x2_t w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
+ vreinterpretq_u16_u8(w3.val[0]));
+ uint16x8x2_t w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]),
+ vreinterpretq_u16_u8(w5.val[0]));
+ uint16x8x2_t w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]),
+ vreinterpretq_u16_u8(w7.val[0]));
+
+ uint32x4x2_t w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
+ vreinterpretq_u32_u16(w9.val[0]));
+ uint32x4x2_t w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
+ vreinterpretq_u32_u16(w11.val[0]));
+ uint32x4x2_t w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
+ vreinterpretq_u32_u16(w9.val[1]));
+ uint32x4x2_t w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
+ vreinterpretq_u32_u16(w11.val[1]));
+
+ uint16x8x2_t d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
+ d[0] = vreinterpretq_u8_u16(d01.val[0]);
+ d[1] = vreinterpretq_u8_u16(d01.val[1]);
+ uint16x8x2_t d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
+ d[2] = vreinterpretq_u8_u16(d23.val[0]);
+ d[3] = vreinterpretq_u8_u16(d23.val[1]);
+ uint16x8x2_t d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
+ d[4] = vreinterpretq_u8_u16(d45.val[0]);
+ d[5] = vreinterpretq_u8_u16(d45.val[1]);
+ uint16x8x2_t d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
+ d[6] = vreinterpretq_u8_u16(d67.val[0]);
+ d[7] = vreinterpretq_u8_u16(d67.val[1]);
+
+ // upper half
+ w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+ vreinterpretq_u16_u8(w1.val[1]));
+ w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
+ vreinterpretq_u16_u8(w3.val[1]));
+ w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]),
+ vreinterpretq_u16_u8(w5.val[1]));
+ w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]),
+ vreinterpretq_u16_u8(w7.val[1]));
+
+ w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
+ vreinterpretq_u32_u16(w9.val[0]));
+ w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
+ vreinterpretq_u32_u16(w11.val[0]));
+ w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
+ vreinterpretq_u32_u16(w9.val[1]));
+ w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
+ vreinterpretq_u32_u16(w11.val[1]));
+
+ d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
+ d[8] = vreinterpretq_u8_u16(d01.val[0]);
+ d[9] = vreinterpretq_u8_u16(d01.val[1]);
+ d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
+ d[10] = vreinterpretq_u8_u16(d23.val[0]);
+ d[11] = vreinterpretq_u8_u16(d23.val[1]);
+ d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
+ d[12] = vreinterpretq_u8_u16(d45.val[0]);
+ d[13] = vreinterpretq_u8_u16(d45.val[1]);
+ d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
+ d[14] = vreinterpretq_u8_u16(d67.val[0]);
+ d[15] = vreinterpretq_u8_u16(d67.val[1]);
+}
+
+static AOM_FORCE_INLINE void transpose_arrays_u8_32x16(const uint8x16x2_t *x,
+ uint8x16_t *d) {
+ uint8x16_t x2[32];
+ for (int i = 0; i < 16; ++i) {
+ x2[i] = x[i].val[0];
+ x2[i + 16] = x[i].val[1];
+ }
+ transpose_arrays_u8_16x16(x2, d);
+ transpose_arrays_u8_16x16(x2 + 16, d + 16);
+}
+
+static INLINE void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1,
+ uint8x8_t *a2,
+ uint8x8_t *a3) {
+ // Swap 8 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+
+ const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
+ const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+
+ const uint16x4x2_t c0 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
+ const uint16x4x2_t c1 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
+
+ *a0 = vreinterpret_u8_u16(c0.val[0]);
+ *a1 = vreinterpret_u8_u16(c1.val[0]);
+ *a2 = vreinterpret_u8_u16(c0.val[1]);
+ *a3 = vreinterpret_u8_u16(c1.val[1]);
+}
+
+static INLINE void transpose_elems_inplace_u8_4x4(uint8x8_t *a0,
+ uint8x8_t *a1) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 10 11 12 13
+ // a1: 20 21 22 23 30 31 32 33
+ // to:
+ // b0.val[0]: 00 01 20 21 10 11 30 31
+ // b0.val[1]: 02 03 22 23 12 13 32 33
+
+ const uint16x4x2_t b0 =
+ vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 02 03 22 23
+ // c0.val[1]: 10 11 30 31 12 13 32 33
+
+ const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+ vreinterpret_u32_u16(b0.val[1]));
+
+ // Swap 8 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 02 12 22 32
+ // d0.val[1]: 01 11 21 31 03 13 23 33
+
+ const uint8x8x2_t d0 =
+ vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+}
+
+static INLINE void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x8_t a4, uint8x8_t a5,
+ uint8x8_t a6, uint8x8_t a7,
+ uint8x8_t *o0, uint8x8_t *o1,
+ uint8x8_t *o2, uint8x8_t *o3) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03 XX XX XX XX
+ // a1: 10 11 12 13 XX XX XX XX
+ // a2: 20 21 22 23 XX XX XX XX
+  // a3: 30 31 32 33 XX XX XX XX
+ // a4: 40 41 42 43 XX XX XX XX
+ // a5: 50 51 52 53 XX XX XX XX
+ // a6: 60 61 62 63 XX XX XX XX
+ // a7: 70 71 72 73 XX XX XX XX
+ // to:
+ // b0.val[0]: 00 01 02 03 40 41 42 43
+ // b1.val[0]: 10 11 12 13 50 51 52 53
+ // b2.val[0]: 20 21 22 23 60 61 62 63
+ // b3.val[0]: 30 31 32 33 70 71 72 73
+
+ const uint32x2x2_t b0 =
+ vtrn_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
+ const uint32x2x2_t b1 =
+ vtrn_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
+ const uint32x2x2_t b2 =
+ vtrn_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
+ const uint32x2x2_t b3 =
+ vtrn_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 40 41 60 61
+ // c0.val[1]: 02 03 22 23 42 43 62 63
+ // c1.val[0]: 10 11 30 31 50 51 70 71
+ // c1.val[1]: 12 13 32 33 52 53 72 73
+
+ const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
+ vreinterpret_u16_u32(b2.val[0]));
+ const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
+ vreinterpret_u16_u32(b3.val[0]));
+
+ // Swap 8 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 01 11 21 31 41 51 61 71
+ // d1.val[0]: 02 12 22 32 42 52 62 72
+ // d1.val[1]: 03 13 23 33 43 53 63 73
+
+ const uint8x8x2_t d0 =
+ vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
+ const uint8x8x2_t d1 =
+ vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));
+
+ *o0 = d0.val[0];
+ *o1 = d0.val[1];
+ *o2 = d1.val[0];
+ *o3 = d1.val[1];
+}
+
+static INLINE void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) {
+ // Input:
+ // 00 01 02 03
+ // 10 11 12 13
+ // 20 21 22 23
+ // 30 31 32 33
+
+ // b:
+ // 00 10 02 12
+ // 01 11 03 13
+ const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
+ // c:
+ // 20 30 22 32
+ // 21 31 23 33
+ const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
+ // d:
+ // 00 10 20 30
+ // 02 12 22 32
+ const uint32x2x2_t d =
+ vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
+ // e:
+ // 01 11 21 31
+ // 03 13 23 33
+ const uint32x2x2_t e =
+ vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
+
+ // Output:
+ // 00 10 20 30
+ // 01 11 21 31
+ // 02 12 22 32
+ // 03 13 23 33
+ a[0] = vreinterpret_u16_u32(d.val[0]);
+ a[1] = vreinterpret_u16_u32(e.val[0]);
+ a[2] = vreinterpret_u16_u32(d.val[1]);
+ a[3] = vreinterpret_u16_u32(e.val[1]);
+}
+
+static INLINE void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) {
+ // 4x8 Input:
+ // a[0]: 00 01 02 03 04 05 06 07
+ // a[1]: 10 11 12 13 14 15 16 17
+ // a[2]: 20 21 22 23 24 25 26 27
+ // a[3]: 30 31 32 33 34 35 36 37
+
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+ const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+
+ // 8x4 Output:
+ // a[0]: 00 10 20 30 04 14 24 34
+ // a[1]: 01 11 21 31 05 15 25 35
+ // a[2]: 02 12 22 32 06 16 26 36
+ // a[3]: 03 13 23 33 07 17 27 37
+ a[0] = vreinterpretq_u16_u32(c0.val[0]);
+ a[1] = vreinterpretq_u16_u32(c1.val[0]);
+ a[2] = vreinterpretq_u16_u32(c0.val[1]);
+ a[3] = vreinterpretq_u16_u32(c1.val[1]);
+}
+
+// Special transpose for loop filter.
+// 4x8 Input:
+// p_q: p3 p2 p1 p0 q0 q1 q2 q3
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// 8x4 Output:
+// a[0]: 03 13 23 33 04 14 24 34 p0q0
+// a[1]: 02 12 22 32 05 15 25 35 p1q1
+// a[2]: 01 11 21 31 06 16 26 36 p2q2
+// a[3]: 00 10 20 30 07 17 27 37 p3q3
+// Direct reapplication of the function will reset the high halves, but
+// reverse the low halves:
+// p_q: p0 p1 p2 p3 q0 q1 q2 q3
+// a[0]: 33 32 31 30 04 05 06 07
+// a[1]: 23 22 21 20 14 15 16 17
+// a[2]: 13 12 11 10 24 25 26 27
+// a[3]: 03 02 01 00 34 35 36 37
+// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
+// reverse the high halves.
+// The standard transpose_array_inplace_u16_4x8 above will produce the same
+// reversals, but with the order of the low halves also restored relative to
+// the high halves. This is
+// preferable because it puts all values from the same source row back together,
+// but some post-processing is inevitable.
+static INLINE void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) {
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+ const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+
+ // Reverse odd vectors to bring the appropriate items to the front of zips.
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // r0 : 03 13 01 11 07 17 05 15
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // r1 : 23 33 21 31 27 37 25 35
+ const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
+ const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));
+
+ // Zip to complete the halves.
+ // c0.val[0]: 00 10 20 30 02 12 22 32 p3p1
+ // c0.val[1]: 04 14 24 34 06 16 26 36 q0q2
+ // c1.val[0]: 03 13 23 33 01 11 21 31 p0p2
+ // c1.val[1]: 07 17 27 37 05 15 25 35 q3q1
+ const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vzipq_u32(r0, r1);
+
+ // d0.val[0]: 00 10 20 30 07 17 27 37 p3q3
+ // d0.val[1]: 02 12 22 32 05 15 25 35 p1q1
+ // d1.val[0]: 03 13 23 33 04 14 24 34 p0q0
+ // d1.val[1]: 01 11 21 31 06 16 26 36 p2q2
+ const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]);
+ // The third row of c comes first here to swap p2 with q0.
+ const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]);
+
+ // 8x4 Output:
+ // a[0]: 03 13 23 33 04 14 24 34 p0q0
+ // a[1]: 02 12 22 32 05 15 25 35 p1q1
+ // a[2]: 01 11 21 31 06 16 26 36 p2q2
+ // a[3]: 00 10 20 30 07 17 27 37 p3q3
+ a[0] = d1.val[0]; // p0q0
+ a[1] = d0.val[1]; // p1q1
+ a[2] = d1.val[1]; // p2q2
+ a[3] = d0.val[0]; // p3q3
+}
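+
+// Usage sketch (illustrative): 'a' holds four rows of eight pixels read across
+// a vertical filtering edge, i.e. a[r] = { p3 p2 p1 p0 q0 q1 q2 q3 } of row r.
+// After the call, a[i] packs p_i of the four rows in its low half and q_i in
+// its high half:
+//
+//   uint16x8_t a[4];  // a[r]: p3 p2 p1 p0 q0 q1 q2 q3 of row r
+//   loop_filter_transpose_u16_4x8q(a);
+//   // a[0]: p0(r0) p0(r1) p0(r2) p0(r3) q0(r0) q0(r1) q0(r2) q0(r3)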
+
+static INLINE void transpose_elems_u16_4x8(
+ const uint16x4_t a0, const uint16x4_t a1, const uint16x4_t a2,
+ const uint16x4_t a3, const uint16x4_t a4, const uint16x4_t a5,
+ const uint16x4_t a6, const uint16x4_t a7, uint16x8_t *o0, uint16x8_t *o1,
+ uint16x8_t *o2, uint16x8_t *o3) {
+ // Combine rows. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0: 00 01 02 03 40 41 42 43
+ // b1: 10 11 12 13 50 51 52 53
+ // b2: 20 21 22 23 60 61 62 63
+ // b3: 30 31 32 33 70 71 72 73
+
+ const uint16x8_t b0 = vcombine_u16(a0, a4);
+ const uint16x8_t b1 = vcombine_u16(a1, a5);
+ const uint16x8_t b2 = vcombine_u16(a2, a6);
+ const uint16x8_t b3 = vcombine_u16(a3, a7);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 02 12 40 50 42 52
+ // c0.val[1]: 01 11 03 13 41 51 43 53
+ // c1.val[0]: 20 30 22 32 60 70 62 72
+ // c1.val[1]: 21 31 23 33 61 71 63 73
+
+ const uint16x8x2_t c0 = vtrnq_u16(b0, b1);
+ const uint16x8x2_t c1 = vtrnq_u16(b2, b3);
+
+ // Swap 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 02 12 22 32 42 52 62 72
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 03 13 23 33 43 53 63 73
+
+ const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ *o0 = vreinterpretq_u16_u32(d0.val[0]);
+ *o1 = vreinterpretq_u16_u32(d1.val[0]);
+ *o2 = vreinterpretq_u16_u32(d0.val[1]);
+ *o3 = vreinterpretq_u16_u32(d1.val[1]);
+}
+
+static INLINE void transpose_elems_s16_4x8(
+ const int16x4_t a0, const int16x4_t a1, const int16x4_t a2,
+ const int16x4_t a3, const int16x4_t a4, const int16x4_t a5,
+ const int16x4_t a6, const int16x4_t a7, int16x8_t *o0, int16x8_t *o1,
+ int16x8_t *o2, int16x8_t *o3) {
+ // Combine rows. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0: 00 01 02 03 40 41 42 43
+ // b1: 10 11 12 13 50 51 52 53
+ // b2: 20 21 22 23 60 61 62 63
+ // b3: 30 31 32 33 70 71 72 73
+
+ const int16x8_t b0 = vcombine_s16(a0, a4);
+ const int16x8_t b1 = vcombine_s16(a1, a5);
+ const int16x8_t b2 = vcombine_s16(a2, a6);
+ const int16x8_t b3 = vcombine_s16(a3, a7);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 02 12 40 50 42 52
+ // c0.val[1]: 01 11 03 13 41 51 43 53
+ // c1.val[0]: 20 30 22 32 60 70 62 72
+ // c1.val[1]: 21 31 23 33 61 71 63 73
+
+ const int16x8x2_t c0 = vtrnq_s16(b0, b1);
+ const int16x8x2_t c1 = vtrnq_s16(b2, b3);
+
+ // Swap 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 02 12 22 32 42 52 62 72
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 03 13 23 33 43 53 63 73
+
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+ vreinterpretq_s32_s16(c1.val[0]));
+ const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+ vreinterpretq_s32_s16(c1.val[1]));
+
+ *o0 = vreinterpretq_s16_s32(d0.val[0]);
+ *o1 = vreinterpretq_s16_s32(d1.val[0]);
+ *o2 = vreinterpretq_s16_s32(d0.val[1]);
+ *o3 = vreinterpretq_s16_s32(d1.val[1]);
+}
+
+static INLINE void transpose_elems_inplace_u16_8x8(
+ uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3,
+ uint16x8_t *a4, uint16x8_t *a5, uint16x8_t *a6, uint16x8_t *a7) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
+ const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
+ const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
+ const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+ const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
+ vreinterpretq_u32_u16(b3.val[0]));
+ const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
+ vreinterpretq_u32_u16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+
+ const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
+ const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
+ const uint16x8x2_t d2 = aom_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
+ const uint16x8x2_t d3 = aom_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);
+
+ *a0 = d0.val[0];
+ *a1 = d1.val[0];
+ *a2 = d2.val[0];
+ *a3 = d3.val[0];
+ *a4 = d0.val[1];
+ *a5 = d1.val[1];
+ *a6 = d2.val[1];
+ *a7 = d3.val[1];
+}
+
+static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
+ int16x8x2_t b0;
+#if AOM_ARCH_AARCH64
+ b0.val[0] = vreinterpretq_s16_s64(
+ vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+ b0.val[1] = vreinterpretq_s16_s64(
+ vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+#else
+ b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
+ vreinterpret_s16_s32(vget_low_s32(a1)));
+ b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
+ vreinterpret_s16_s32(vget_high_s32(a1)));
+#endif
+ return b0;
+}
+
+static INLINE void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1,
+ int16x8_t *a2, int16x8_t *a3,
+ int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6,
+ int16x8_t *a7) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
+ const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
+ const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
+ const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+
+ const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+ *a0 = d0.val[0];
+ *a1 = d1.val[0];
+ *a2 = d2.val[0];
+ *a3 = d3.val[0];
+ *a4 = d0.val[1];
+ *a5 = d1.val[1];
+ *a6 = d2.val[1];
+ *a7 = d3.val[1];
+}
+
+static INLINE void transpose_arrays_s16_8x8(const int16x8_t *a,
+ int16x8_t *out) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
+ const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
+ const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
+ const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+
+ const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+ out[0] = d0.val[0];
+ out[1] = d1.val[0];
+ out[2] = d2.val[0];
+ out[3] = d3.val[0];
+ out[4] = d0.val[1];
+ out[5] = d1.val[1];
+ out[6] = d2.val[1];
+ out[7] = d3.val[1];
+}
+
+static INLINE void transpose_elems_inplace_u16_4x4(uint16x4_t *a0,
+ uint16x4_t *a1,
+ uint16x4_t *a2,
+ uint16x4_t *a3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const uint16x4x2_t b0 = vtrn_u16(*a0, *a1);
+ const uint16x4x2_t b1 = vtrn_u16(*a2, *a3);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+ vreinterpret_u32_u16(b1.val[0]));
+ const uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
+ vreinterpret_u32_u16(b1.val[1]));
+
+ *a0 = vreinterpret_u16_u32(c0.val[0]);
+ *a1 = vreinterpret_u16_u32(c1.val[0]);
+ *a2 = vreinterpret_u16_u32(c0.val[1]);
+ *a3 = vreinterpret_u16_u32(c1.val[1]);
+}
+
+static INLINE void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1,
+ int16x4_t *a2,
+ int16x4_t *a3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
+ const int16x4x2_t b1 = vtrn_s16(*a2, *a3);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+ vreinterpret_s32_s16(b1.val[0]));
+ const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+ vreinterpret_s32_s16(b1.val[1]));
+
+ *a0 = vreinterpret_s16_s32(c0.val[0]);
+ *a1 = vreinterpret_s16_s32(c1.val[0]);
+ *a2 = vreinterpret_s16_s32(c0.val[1]);
+ *a3 = vreinterpret_s16_s32(c1.val[1]);
+}
+
+static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+ int32x4x2_t b0;
+#if AOM_ARCH_AARCH64
+ b0.val[0] = vreinterpretq_s32_s64(
+ vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+ b0.val[1] = vreinterpretq_s32_s64(
+ vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+#else
+ b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
+ b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+#endif
+ return b0;
+}
+
+static INLINE void transpose_elems_s32_4x4(const int32x4_t a0,
+ const int32x4_t a1,
+ const int32x4_t a2,
+ const int32x4_t a3, int32x4_t *o0,
+ int32x4_t *o1, int32x4_t *o2,
+ int32x4_t *o3) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const int32x4x2_t b0 = vtrnq_s32(a0, a1);
+ const int32x4x2_t b1 = vtrnq_s32(a2, a3);
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
+ const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
+
+ *o0 = c0.val[0];
+ *o1 = c1.val[0];
+ *o2 = c0.val[1];
+ *o3 = c1.val[1];
+}
+
+static INLINE void transpose_elems_inplace_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+ int32x4_t *a2,
+ int32x4_t *a3) {
+ transpose_elems_s32_4x4(*a0, *a1, *a2, *a3, a0, a1, a2, a3);
+}
+
+static INLINE void transpose_arrays_s32_4x4(const int32x4_t *in,
+ int32x4_t *out) {
+ transpose_elems_s32_4x4(in[0], in[1], in[2], in[3], &out[0], &out[1], &out[2],
+ &out[3]);
+}
+
+static AOM_FORCE_INLINE void transpose_arrays_s32_4nx4n(const int32x4_t *in,
+ int32x4_t *out,
+ const int width,
+ const int height) {
+ const int h = height >> 2;
+ const int w = width >> 2;
+ for (int j = 0; j < w; j++) {
+ for (int i = 0; i < h; i++) {
+ transpose_arrays_s32_4x4(in + j * height + i * 4,
+ out + i * width + j * 4);
+ }
+ }
+}
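+
+// Usage sketch (illustrative; the layout is inferred from the indexing above):
+// 'in' holds width/4 column groups, each stored as 'height' consecutive
+// int32x4_t rows, and 'out' receives the transpose in the same layout. E.g.
+// for an 8x8 tile:
+//
+//   int32x4_t in[2 * 8];   // in[j * 8 + r] = row r of column group j
+//   int32x4_t out[2 * 8];  // transposed result, same layout
+//   transpose_arrays_s32_4nx4n(in, out, /*width=*/8, /*height=*/8);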
+
+#define TRANSPOSE_ARRAYS_S32_WXH_NEON(w, h) \
+ static AOM_FORCE_INLINE void transpose_arrays_s32_##w##x##h( \
+ const int32x4_t *in, int32x4_t *out) { \
+ transpose_arrays_s32_4nx4n(in, out, w, h); \
+ }
+
+TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 4)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 32)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 32)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 64)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 32)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 64)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 32)
+
+#undef TRANSPOSE_ARRAYS_S32_WXH_NEON
+
+static INLINE int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) {
+#if AOM_ARCH_AARCH64
+ return vtrn1q_s64(a, b);
+#else
+ return vcombine_s64(vget_low_s64(a), vget_low_s64(b));
+#endif
+}
+
+static INLINE int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) {
+#if AOM_ARCH_AARCH64
+ return vtrn2q_s64(a, b);
+#else
+ return vcombine_s64(vget_high_s64(a), vget_high_s64(b));
+#endif
+}
+
+static INLINE void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1,
+ int32x4_t a2, int32x4_t a3,
+ int32x4_t a4, int32x4_t a5,
+ int32x4_t a6, int32x4_t a7,
+ int32x4x2_t *o0, int32x4x2_t *o1,
+ int32x4x2_t *o2, int32x4x2_t *o3) {
+ // Perform a 4 x 8 matrix transpose by building on top of the existing 4 x 4
+ // matrix transpose implementation:
+ // [ A ]^T => [ A^T B^T ]
+ // [ B ]
+
+ transpose_elems_inplace_s32_4x4(&a0, &a1, &a2, &a3); // A^T
+ transpose_elems_inplace_s32_4x4(&a4, &a5, &a6, &a7); // B^T
+
+ o0->val[0] = a0;
+ o1->val[0] = a1;
+ o2->val[0] = a2;
+ o3->val[0] = a3;
+
+ o0->val[1] = a4;
+ o1->val[1] = a5;
+ o2->val[1] = a6;
+ o3->val[1] = a7;
+}
+
+static INLINE void transpose_elems_inplace_s32_8x8(
+ int32x4x2_t *a0, int32x4x2_t *a1, int32x4x2_t *a2, int32x4x2_t *a3,
+ int32x4x2_t *a4, int32x4x2_t *a5, int32x4x2_t *a6, int32x4x2_t *a7) {
+ // Perform an 8 x 8 matrix transpose by building on top of the existing 4 x 4
+ // matrix transpose implementation:
+ // [ A B ]^T => [ A^T C^T ]
+ // [ C D ] [ B^T D^T ]
+
+ int32x4_t q0_v1 = a0->val[0];
+ int32x4_t q0_v2 = a1->val[0];
+ int32x4_t q0_v3 = a2->val[0];
+ int32x4_t q0_v4 = a3->val[0];
+
+ int32x4_t q1_v1 = a0->val[1];
+ int32x4_t q1_v2 = a1->val[1];
+ int32x4_t q1_v3 = a2->val[1];
+ int32x4_t q1_v4 = a3->val[1];
+
+ int32x4_t q2_v1 = a4->val[0];
+ int32x4_t q2_v2 = a5->val[0];
+ int32x4_t q2_v3 = a6->val[0];
+ int32x4_t q2_v4 = a7->val[0];
+
+ int32x4_t q3_v1 = a4->val[1];
+ int32x4_t q3_v2 = a5->val[1];
+ int32x4_t q3_v3 = a6->val[1];
+ int32x4_t q3_v4 = a7->val[1];
+
+ transpose_elems_inplace_s32_4x4(&q0_v1, &q0_v2, &q0_v3, &q0_v4); // A^T
+ transpose_elems_inplace_s32_4x4(&q1_v1, &q1_v2, &q1_v3, &q1_v4); // B^T
+ transpose_elems_inplace_s32_4x4(&q2_v1, &q2_v2, &q2_v3, &q2_v4); // C^T
+ transpose_elems_inplace_s32_4x4(&q3_v1, &q3_v2, &q3_v3, &q3_v4); // D^T
+
+ a0->val[0] = q0_v1;
+ a1->val[0] = q0_v2;
+ a2->val[0] = q0_v3;
+ a3->val[0] = q0_v4;
+
+ a0->val[1] = q2_v1;
+ a1->val[1] = q2_v2;
+ a2->val[1] = q2_v3;
+ a3->val[1] = q2_v4;
+
+ a4->val[0] = q1_v1;
+ a5->val[0] = q1_v2;
+ a6->val[0] = q1_v3;
+ a7->val[0] = q1_v4;
+
+ a4->val[1] = q3_v1;
+ a5->val[1] = q3_v2;
+ a6->val[1] = q3_v3;
+ a7->val[1] = q3_v4;
+}
+
+static INLINE void transpose_arrays_s16_4x4(const int16x4_t *const in,
+ int16x4_t *const out) {
+ int16x4_t a0 = in[0];
+ int16x4_t a1 = in[1];
+ int16x4_t a2 = in[2];
+ int16x4_t a3 = in[3];
+
+ transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);
+
+ out[0] = a0;
+ out[1] = a1;
+ out[2] = a2;
+ out[3] = a3;
+}
+
+static INLINE void transpose_arrays_s16_4x8(const int16x4_t *const in,
+ int16x8_t *const out) {
+#if AOM_ARCH_AARCH64
+ const int16x8_t a0 = vzip1q_s16(vcombine_s16(in[0], vdup_n_s16(0)),
+ vcombine_s16(in[1], vdup_n_s16(0)));
+ const int16x8_t a1 = vzip1q_s16(vcombine_s16(in[2], vdup_n_s16(0)),
+ vcombine_s16(in[3], vdup_n_s16(0)));
+ const int16x8_t a2 = vzip1q_s16(vcombine_s16(in[4], vdup_n_s16(0)),
+ vcombine_s16(in[5], vdup_n_s16(0)));
+ const int16x8_t a3 = vzip1q_s16(vcombine_s16(in[6], vdup_n_s16(0)),
+ vcombine_s16(in[7], vdup_n_s16(0)));
+#else
+ int16x4x2_t temp;
+ temp = vzip_s16(in[0], in[1]);
+ const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
+ temp = vzip_s16(in[2], in[3]);
+ const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
+ temp = vzip_s16(in[4], in[5]);
+ const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
+ temp = vzip_s16(in[6], in[7]);
+ const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
+#endif
+
+ const int32x4x2_t b02 =
+ vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
+ const int32x4x2_t b13 =
+ vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));
+
+#if AOM_ARCH_AARCH64
+ out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
+ vreinterpretq_s64_s32(b13.val[0])));
+ out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
+ vreinterpretq_s64_s32(b13.val[0])));
+ out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
+ vreinterpretq_s64_s32(b13.val[1])));
+ out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
+ vreinterpretq_s64_s32(b13.val[1])));
+#else
+ out[0] = vreinterpretq_s16_s32(
+ vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
+ out[2] = vreinterpretq_s16_s32(
+ vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
+ out[1] = vreinterpretq_s16_s32(
+ vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
+ out[3] = vreinterpretq_s16_s32(
+ vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
+#endif
+}
+
+static INLINE void transpose_arrays_s16_8x4(const int16x8_t *const in,
+ int16x4_t *const out) {
+ // Swap 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+
+ const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
+ const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[0]),
+ vreinterpretq_u32_s16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[1]),
+ vreinterpretq_u32_s16(b1.val[1]));
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+
+ out[0] = vget_low_s16(vreinterpretq_s16_u32(c0.val[0]));
+ out[1] = vget_low_s16(vreinterpretq_s16_u32(c1.val[0]));
+ out[2] = vget_low_s16(vreinterpretq_s16_u32(c0.val[1]));
+ out[3] = vget_low_s16(vreinterpretq_s16_u32(c1.val[1]));
+ out[4] = vget_high_s16(vreinterpretq_s16_u32(c0.val[0]));
+ out[5] = vget_high_s16(vreinterpretq_s16_u32(c1.val[0]));
+ out[6] = vget_high_s16(vreinterpretq_s16_u32(c0.val[1]));
+ out[7] = vget_high_s16(vreinterpretq_s16_u32(c1.val[1]));
+}
+
+#endif // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/variance_neon.c b/third_party/aom/aom_dsp/arm/variance_neon.c
new file mode 100644
index 0000000000..9e4e8c0cf0
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/variance_neon.c
@@ -0,0 +1,470 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void variance_4xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+
+ // Number of rows we can process before 'sum_s16' overflows:
+ // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows.
+ assert(h <= 256);
+
+ int i = h;
+ do {
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+ int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
+
+ sum_s16 = vaddq_s16(sum_s16, diff);
+
+ sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff));
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ *sum = horizontal_add_s16x8(sum_s16);
+ *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
+}
+
+static INLINE void variance_8xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ // Number of rows we can process before 'sum_s16' overflows:
+ // 32767 / 255 ~= 128
+ assert(h <= 128);
+
+ int i = h;
+ do {
+ uint8x8_t s = vld1_u8(src);
+ uint8x8_t r = vld1_u8(ref);
+ int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
+
+ sum_s16 = vaddq_s16(sum_s16, diff);
+
+ sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_s16x8(sum_s16);
+ *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+static INLINE void variance_16xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ // Number of rows we can process before 'sum_s16' accumulators overflow:
+ // 32767 / 255 ~= 128, so 128 16-wide rows.
+ assert(h <= 128);
+
+ int i = h;
+ do {
+ uint8x16_t s = vld1q_u8(src);
+ uint8x16_t r = vld1q_u8(ref);
+
+ int16x8_t diff_l =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+ int16x8_t diff_h =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+ sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+ sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_s16x8(vaddq_s16(sum_s16[0], sum_s16[1]));
+ *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+static INLINE void variance_large_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int w, int h, int h_limit, uint32_t *sse,
+ int *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit
+ // accumulator overflows. After hitting this limit we accumulate into 32-bit
+ // elements.
+ int h_tmp = h > h_limit ? h_limit : h;
+
+ int i = 0;
+ do {
+ int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src + j);
+ uint8x16_t r = vld1q_u8(ref + j);
+
+ int16x8_t diff_l =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+ int16x8_t diff_h =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+ sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+ sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+ j += 16;
+ } while (j < w);
+
+ src += src_stride;
+ ref += ref_stride;
+ i++;
+ } while (i < h_tmp);
+
+ sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]);
+ sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]);
+
+ h_tmp += h_limit;
+ } while (i < h);
+
+ *sum = horizontal_add_s32x4(sum_s32);
+ *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
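+
+// Worked example of the 'h_limit' bound used by the callers below
+// (illustrative): each int16 lane of 'sum_s16' accumulates w / 16 difference
+// values per row, each in [-255, 255], so after r rows its magnitude is at
+// most r * (w / 16) * 255. Keeping this within 32767 gives roughly
+// r <= 32767 / ((w / 16) * 255), i.e. ~64 rows for w == 32, ~32 for w == 64
+// and ~16 for w == 128, matching the h_limit values passed below.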
+
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum);
+}
+
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum);
+}
+
+static INLINE void variance_128xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 128, h, 16, sse, sum);
+}
+
+#define VARIANCE_WXH_NEON(w, h, shift) \
+ unsigned int aom_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+ }
+
+VARIANCE_WXH_NEON(4, 4, 4)
+VARIANCE_WXH_NEON(4, 8, 5)
+VARIANCE_WXH_NEON(4, 16, 6)
+
+VARIANCE_WXH_NEON(8, 4, 5)
+VARIANCE_WXH_NEON(8, 8, 6)
+VARIANCE_WXH_NEON(8, 16, 7)
+VARIANCE_WXH_NEON(8, 32, 8)
+
+VARIANCE_WXH_NEON(16, 4, 6)
+VARIANCE_WXH_NEON(16, 8, 7)
+VARIANCE_WXH_NEON(16, 16, 8)
+VARIANCE_WXH_NEON(16, 32, 9)
+VARIANCE_WXH_NEON(16, 64, 10)
+
+VARIANCE_WXH_NEON(32, 8, 8)
+VARIANCE_WXH_NEON(32, 16, 9)
+VARIANCE_WXH_NEON(32, 32, 10)
+VARIANCE_WXH_NEON(32, 64, 11)
+
+VARIANCE_WXH_NEON(64, 16, 10)
+VARIANCE_WXH_NEON(64, 32, 11)
+VARIANCE_WXH_NEON(64, 64, 12)
+VARIANCE_WXH_NEON(64, 128, 13)
+
+VARIANCE_WXH_NEON(128, 64, 13)
+VARIANCE_WXH_NEON(128, 128, 14)
+
+#undef VARIANCE_WXH_NEON
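+
+// Worked example for the macro above (illustrative): aom_variance16x16_neon is
+// instantiated with shift = 8 = log2(16 * 16), so it returns
+//
+//   sse - ((sum * sum) >> 8)  ==  sum(d * d) - (sum(d))^2 / N
+//
+// with d = src - ref and N = w * h: the sum of squared deviations (N times the
+// variance), computed in a single pass over the block.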
+
+// TODO(yunqingwang): Perform variance of two/four 8x8 blocks similarly to the
+// AVX2 version. Also, implement the variance computation at the end of this
+// function using Neon.
+void aom_get_var_sse_sum_8x8_quad_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ uint32_t *sse8x8, int *sum8x8,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var8x8) {
+ // Loop over four 8x8 blocks. Process one 8x32 block.
+ for (int k = 0; k < 4; k++) {
+ variance_8xh_neon(src + (k * 8), src_stride, ref + (k * 8), ref_stride, 8,
+ &sse8x8[k], &sum8x8[k]);
+ }
+
+ *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
+ *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
+ for (int i = 0; i < 4; i++) {
+ var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
+ }
+}
+
+void aom_get_var_sse_sum_16x16_dual_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ uint32_t *sse16x16,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var16x16) {
+ int sum16x16[2] = { 0 };
+ // Loop over two 16x16 blocks. Process one 16x32 block.
+ for (int k = 0; k < 2; k++) {
+ variance_16xh_neon(src + (k * 16), src_stride, ref + (k * 16), ref_stride,
+ 16, &sse16x16[k], &sum16x16[k]);
+ }
+
+ *tot_sse += sse16x16[0] + sse16x16[1];
+ *tot_sum += sum16x16[0] + sum16x16[1];
+ for (int i = 0; i < 2; i++) {
+ var16x16[i] =
+ sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
+ }
+}
+
+static INLINE unsigned int mse8xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int h) {
+ uint8x8_t s[2], r[2];
+ int16x4_t diff_lo[2], diff_hi[2];
+ uint16x8_t diff[2];
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ int i = h;
+ do {
+ s[0] = vld1_u8(src);
+ src += src_stride;
+ s[1] = vld1_u8(src);
+ src += src_stride;
+ r[0] = vld1_u8(ref);
+ ref += ref_stride;
+ r[1] = vld1_u8(ref);
+ ref += ref_stride;
+
+ diff[0] = vsubl_u8(s[0], r[0]);
+ diff[1] = vsubl_u8(s[1], r[1]);
+
+ diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0]));
+ diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1]));
+ sse_s32[0] = vmlal_s16(sse_s32[0], diff_lo[0], diff_lo[0]);
+ sse_s32[1] = vmlal_s16(sse_s32[1], diff_lo[1], diff_lo[1]);
+
+ diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0]));
+ diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1]));
+ sse_s32[0] = vmlal_s16(sse_s32[0], diff_hi[0], diff_hi[0]);
+ sse_s32[1] = vmlal_s16(sse_s32[1], diff_hi[1], diff_hi[1]);
+
+ i -= 2;
+ } while (i != 0);
+
+ sse_s32[0] = vaddq_s32(sse_s32[0], sse_s32[1]);
+
+ *sse = horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0]));
+ return horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0]));
+}
+
+static INLINE unsigned int mse16xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int h) {
+ uint8x16_t s[2], r[2];
+ int16x4_t diff_lo[4], diff_hi[4];
+ uint16x8_t diff[4];
+ int32x4_t sse_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+
+ int i = h;
+ do {
+ s[0] = vld1q_u8(src);
+ src += src_stride;
+ s[1] = vld1q_u8(src);
+ src += src_stride;
+ r[0] = vld1q_u8(ref);
+ ref += ref_stride;
+ r[1] = vld1q_u8(ref);
+ ref += ref_stride;
+
+ diff[0] = vsubl_u8(vget_low_u8(s[0]), vget_low_u8(r[0]));
+ diff[1] = vsubl_u8(vget_high_u8(s[0]), vget_high_u8(r[0]));
+ diff[2] = vsubl_u8(vget_low_u8(s[1]), vget_low_u8(r[1]));
+ diff[3] = vsubl_u8(vget_high_u8(s[1]), vget_high_u8(r[1]));
+
+ diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0]));
+ diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1]));
+ sse_s32[0] = vmlal_s16(sse_s32[0], diff_lo[0], diff_lo[0]);
+ sse_s32[1] = vmlal_s16(sse_s32[1], diff_lo[1], diff_lo[1]);
+
+ diff_lo[2] = vreinterpret_s16_u16(vget_low_u16(diff[2]));
+ diff_lo[3] = vreinterpret_s16_u16(vget_low_u16(diff[3]));
+ sse_s32[2] = vmlal_s16(sse_s32[2], diff_lo[2], diff_lo[2]);
+ sse_s32[3] = vmlal_s16(sse_s32[3], diff_lo[3], diff_lo[3]);
+
+ diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0]));
+ diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1]));
+ sse_s32[0] = vmlal_s16(sse_s32[0], diff_hi[0], diff_hi[0]);
+ sse_s32[1] = vmlal_s16(sse_s32[1], diff_hi[1], diff_hi[1]);
+
+ diff_hi[2] = vreinterpret_s16_u16(vget_high_u16(diff[2]));
+ diff_hi[3] = vreinterpret_s16_u16(vget_high_u16(diff[3]));
+ sse_s32[2] = vmlal_s16(sse_s32[2], diff_hi[2], diff_hi[2]);
+ sse_s32[3] = vmlal_s16(sse_s32[3], diff_hi[3], diff_hi[3]);
+
+ i -= 2;
+ } while (i != 0);
+
+ sse_s32[0] = vaddq_s32(sse_s32[0], sse_s32[1]);
+ sse_s32[2] = vaddq_s32(sse_s32[2], sse_s32[3]);
+ sse_s32[0] = vaddq_s32(sse_s32[0], sse_s32[2]);
+
+ *sse = horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0]));
+ return horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0]));
+}
+
+#define MSE_WXH_NEON(w, h) \
+ unsigned int aom_mse##w##x##h##_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ return mse##w##xh_neon(src, src_stride, ref, ref_stride, sse, h); \
+ }
+
+MSE_WXH_NEON(8, 8)
+MSE_WXH_NEON(8, 16)
+
+MSE_WXH_NEON(16, 8)
+MSE_WXH_NEON(16, 16)
+
+#undef MSE_WXH_NEON
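+
+// For reference, the aom_mse{8,16}x{8,16}_neon functions above return (and
+// store in *sse) the raw sum of squared differences, with no mean removal; an
+// illustrative scalar equivalent:
+//
+//   uint32_t acc = 0;
+//   for (int i = 0; i < h; i++) {
+//     for (int j = 0; j < w; j++) {
+//       const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
+//       acc += d * d;
+//     }
+//   }
+//   *sse = acc;
+//   return acc;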
+
+static INLINE uint64x2_t mse_accumulate_u16_u8_8x2(uint64x2_t sum,
+ uint16x8_t s0, uint16x8_t s1,
+ uint8x8_t d0, uint8x8_t d1) {
+ int16x8_t e0 = vreinterpretq_s16_u16(vsubw_u8(s0, d0));
+ int16x8_t e1 = vreinterpretq_s16_u16(vsubw_u8(s1, d1));
+
+ int32x4_t mse = vmull_s16(vget_low_s16(e0), vget_low_s16(e0));
+ mse = vmlal_s16(mse, vget_high_s16(e0), vget_high_s16(e0));
+ mse = vmlal_s16(mse, vget_low_s16(e1), vget_low_s16(e1));
+ mse = vmlal_s16(mse, vget_high_s16(e1), vget_high_s16(e1));
+
+ return vpadalq_u32(sum, vreinterpretq_u32_s32(mse));
+}
+
+static uint64x2_t mse_wxh_16bit(uint8_t *dst, int dstride, const uint16_t *src,
+ int sstride, int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4));
+
+ uint64x2_t sum = vdupq_n_u64(0);
+
+ if (w == 8) {
+ do {
+ uint8x8_t d0 = vld1_u8(dst + 0 * dstride);
+ uint8x8_t d1 = vld1_u8(dst + 1 * dstride);
+ uint16x8_t s0 = vld1q_u16(src + 0 * sstride);
+ uint16x8_t s1 = vld1q_u16(src + 1 * sstride);
+
+ sum = mse_accumulate_u16_u8_8x2(sum, s0, s1, d0, d1);
+
+ dst += 2 * dstride;
+ src += 2 * sstride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ do {
+ uint8x8_t d0 = load_unaligned_u8_4x2(dst + 0 * dstride, dstride);
+ uint8x8_t d1 = load_unaligned_u8_4x2(dst + 2 * dstride, dstride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride);
+
+ sum = mse_accumulate_u16_u8_8x2(sum, s0, s1, d0, d1);
+
+ dst += 4 * dstride;
+ src += 4 * sstride;
+ h -= 4;
+ } while (h != 0);
+ }
+
+ return sum;
+}
+
+// Computes the sum of squared error between an 8-bit reconstruction (dst) and
+// a 16-bit source (src) for a whole block; despite the "mse" name there is no
+// normalization by the pixel count. This function gets called only for the
+// block sizes 8x8, 8x4, 4x8 and 4x4.
+uint64_t aom_mse_wxh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int w, int h) {
+ return horizontal_add_u64x2(mse_wxh_16bit(dst, dstride, src, sstride, w, h));
+}
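+
+// For reference only: a scalar sketch of the computation performed by
+// aom_mse_wxh_16bit_neon above, under the same argument convention (8-bit
+// dst with stride dstride, 16-bit src with stride sstride). The name
+// mse_wxh_16bit_scalar_ref is hypothetical; this is illustrative, not part
+// of the optimized path.
+static uint64_t mse_wxh_16bit_scalar_ref(const uint8_t *dst, int dstride,
+                                         const uint16_t *src, int sstride,
+                                         int w, int h) {
+  uint64_t sum = 0;
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      const int64_t diff =
+          (int64_t)src[y * sstride + x] - dst[y * dstride + x];
+      sum += (uint64_t)(diff * diff);
+    }
+  }
+  return sum;
+}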
+
+uint32_t aom_get_mb_ss_neon(const int16_t *a) {
+ int32x4_t sse[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ for (int i = 0; i < 256; i = i + 8) {
+ int16x8_t a_s16 = vld1q_s16(a + i);
+
+ sse[0] = vmlal_s16(sse[0], vget_low_s16(a_s16), vget_low_s16(a_s16));
+ sse[1] = vmlal_s16(sse[1], vget_high_s16(a_s16), vget_high_s16(a_s16));
+ }
+
+ return horizontal_add_s32x4(vaddq_s32(sse[0], sse[1]));
+}
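+
+// aom_get_mb_ss_neon above returns the sum of squares of 256 16-bit
+// coefficients (i.e. one 16x16 block). A scalar equivalent, shown only for
+// illustration:
+//
+//   uint32_t ss = 0;
+//   for (int i = 0; i < 256; ++i) ss += (uint32_t)(a[i] * a[i]);
+//   return ss;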
+
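+// Computes the SSE of a 16-wide region as 16/w individual wxh blocks. dst is
+// a conventional 2-D buffer with stride dstride, while src stores each wxh
+// block contiguously (w * h elements per block, blocks back to back), which
+// is why the two pointers advance by w and w * h respectively.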
+uint64_t aom_mse_16xh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src,
+ int w, int h) {
+ uint64x2_t sum = vdupq_n_u64(0);
+
+ int num_blks = 16 / w;
+ do {
+ sum = vaddq_u64(sum, mse_wxh_16bit(dst, dstride, src, w, w, h));
+ dst += w;
+ src += w * h;
+ } while (--num_blks != 0);
+
+ return horizontal_add_u64x2(sum);
+}
diff --git a/third_party/aom/aom_dsp/arm/variance_neon_dotprod.c b/third_party/aom/aom_dsp/arm/variance_neon_dotprod.c
new file mode 100644
index 0000000000..9fb52e1df7
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/variance_neon_dotprod.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void variance_4xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = load_unaligned_u8q(src, src_stride);
+ uint8x16_t r = load_unaligned_u8q(ref, ref_stride);
+
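+    // Sum the source and reference pixels with a dot product against a vector
+    // of ones, and accumulate the sum of squared differences by dotting the
+    // absolute difference with itself: |s - r| * |s - r| == (s - r)^2.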
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src += 4 * src_stride;
+ ref += 4 * ref_stride;
+ i -= 4;
+ } while (i != 0);
+
+ int32x4_t sum_diff =
+ vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+ *sum = horizontal_add_s32x4(sum_diff);
+ *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_8xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
+ uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride));
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ int32x4_t sum_diff =
+ vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+ *sum = horizontal_add_s32x4(sum_diff);
+ *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_16xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vld1q_u8(src);
+ uint8x16_t r = vld1q_u8(ref);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ int32x4_t sum_diff =
+ vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+ *sum = horizontal_add_s32x4(sum_diff);
+ *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_large_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int w, int h,
+ uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src + j);
+ uint8x16_t r = vld1q_u8(ref + j);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ j += 16;
+ } while (j < w);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ int32x4_t sum_diff =
+ vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+ *sum = horizontal_add_s32x4(sum_diff);
+ *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_32xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 32, h, sse,
+ sum);
+}
+
+static INLINE void variance_64xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 64, h, sse,
+ sum);
+}
+
+static INLINE void variance_128xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 128, h, sse,
+ sum);
+}
+
+#define VARIANCE_WXH_NEON_DOTPROD(w, h, shift) \
+ unsigned int aom_variance##w##x##h##_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ variance_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, sse, \
+ &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+ }
+
+VARIANCE_WXH_NEON_DOTPROD(4, 4, 4)
+VARIANCE_WXH_NEON_DOTPROD(4, 8, 5)
+VARIANCE_WXH_NEON_DOTPROD(4, 16, 6)
+
+VARIANCE_WXH_NEON_DOTPROD(8, 4, 5)
+VARIANCE_WXH_NEON_DOTPROD(8, 8, 6)
+VARIANCE_WXH_NEON_DOTPROD(8, 16, 7)
+VARIANCE_WXH_NEON_DOTPROD(8, 32, 8)
+
+VARIANCE_WXH_NEON_DOTPROD(16, 4, 6)
+VARIANCE_WXH_NEON_DOTPROD(16, 8, 7)
+VARIANCE_WXH_NEON_DOTPROD(16, 16, 8)
+VARIANCE_WXH_NEON_DOTPROD(16, 32, 9)
+VARIANCE_WXH_NEON_DOTPROD(16, 64, 10)
+
+VARIANCE_WXH_NEON_DOTPROD(32, 8, 8)
+VARIANCE_WXH_NEON_DOTPROD(32, 16, 9)
+VARIANCE_WXH_NEON_DOTPROD(32, 32, 10)
+VARIANCE_WXH_NEON_DOTPROD(32, 64, 11)
+
+VARIANCE_WXH_NEON_DOTPROD(64, 16, 10)
+VARIANCE_WXH_NEON_DOTPROD(64, 32, 11)
+VARIANCE_WXH_NEON_DOTPROD(64, 64, 12)
+VARIANCE_WXH_NEON_DOTPROD(64, 128, 13)
+
+VARIANCE_WXH_NEON_DOTPROD(128, 64, 13)
+VARIANCE_WXH_NEON_DOTPROD(128, 128, 14)
+
+#undef VARIANCE_WXH_NEON_DOTPROD
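+
+// For reference only: the macro above expands to functions that compute
+//   variance = sse - (sum * sum) / (w * h)
+// where the division is a right shift because w * h is a power of two for
+// every instantiated block size (shift == log2(w * h)). A scalar sketch of
+// the same computation; the name variance_scalar_ref is hypothetical and not
+// part of the optimized path.
+static unsigned int variance_scalar_ref(const uint8_t *src, int src_stride,
+                                        const uint8_t *ref, int ref_stride,
+                                        int w, int h, unsigned int *sse) {
+  int64_t sum = 0;
+  uint64_t sse64 = 0;
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      const int diff = src[y * src_stride + x] - ref[y * ref_stride + x];
+      sum += diff;
+      sse64 += (uint64_t)(diff * diff);
+    }
+  }
+  *sse = (unsigned int)sse64;
+  return (unsigned int)(sse64 - (uint64_t)((sum * sum) / (w * h)));
+}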
+
+void aom_get_var_sse_sum_8x8_quad_neon_dotprod(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var8x8) {
+  // Loop over four horizontally adjacent 8x8 blocks, i.e. one 32x8 region.
+ for (int k = 0; k < 4; k++) {
+ variance_8xh_neon_dotprod(src + (k * 8), src_stride, ref + (k * 8),
+ ref_stride, 8, &sse8x8[k], &sum8x8[k]);
+ }
+
+ *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
+ *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
+ for (int i = 0; i < 4; i++) {
+ var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
+ }
+}
+
+void aom_get_var_sse_sum_16x16_dual_neon_dotprod(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var16x16) {
+ int sum16x16[2] = { 0 };
+  // Loop over two horizontally adjacent 16x16 blocks, i.e. one 32x16 region.
+ for (int k = 0; k < 2; k++) {
+ variance_16xh_neon_dotprod(src + (k * 16), src_stride, ref + (k * 16),
+ ref_stride, 16, &sse16x16[k], &sum16x16[k]);
+ }
+
+ *tot_sse += sse16x16[0] + sse16x16[1];
+ *tot_sum += sum16x16[0] + sum16x16[1];
+ for (int i = 0; i < 2; i++) {
+ var16x16[i] =
+ sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
+ }
+}
+
+static INLINE unsigned int mse8xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride,
+ unsigned int *sse, int h) {
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
+ uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+  *sse = horizontal_add_u32x4(sse_u32);
+  return *sse;
+}
+
+static INLINE unsigned int mse16xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride,
+ unsigned int *sse, int h) {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src);
+ uint8x16_t s1 = vld1q_u8(src + src_stride);
+ uint8x16_t r0 = vld1q_u8(ref);
+ uint8x16_t r1 = vld1q_u8(ref + ref_stride);
+
+ uint8x16_t abs_diff0 = vabdq_u8(s0, r0);
+ uint8x16_t abs_diff1 = vabdq_u8(s1, r1);
+
+ sse_u32[0] = vdotq_u32(sse_u32[0], abs_diff0, abs_diff0);
+ sse_u32[1] = vdotq_u32(sse_u32[1], abs_diff1, abs_diff1);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+  *sse = horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+  return *sse;
+}
+
+#define MSE_WXH_NEON_DOTPROD(w, h) \
+ unsigned int aom_mse##w##x##h##_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ return mse##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, sse, h); \
+ }
+
+MSE_WXH_NEON_DOTPROD(8, 8)
+MSE_WXH_NEON_DOTPROD(8, 16)
+
+MSE_WXH_NEON_DOTPROD(16, 8)
+MSE_WXH_NEON_DOTPROD(16, 16)
+
+#undef MSE_WXH_NEON_DOTPROD
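+
+// Usage sketch (illustrative only): in libaom these kernels are normally
+// reached through the RTCD dispatch declared in config/aom_dsp_rtcd.h rather
+// than called directly, but they can also be exercised standalone, e.g.
+//
+//   unsigned int sse;
+//   unsigned int mse = aom_mse16x16_neon_dotprod(src, src_stride,
+//                                                ref, ref_stride, &sse);
+//
+// For these kernels the return value and *sse are identical: both hold the
+// raw sum of squared differences over the 16x16 block.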