summaryrefslogtreecommitdiffstats
path: root/third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c403
1 files changed, 403 insertions, 0 deletions
diff --git a/third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c b/third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c
new file mode 100644
index 0000000000..a6bd6d38e4
--- /dev/null
+++ b/third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c
@@ -0,0 +1,403 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/convolve.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#define HBD_WIENER_5TAP_HORIZ(name, shift) \
+ static INLINE uint16x8_t name##_wiener_convolve5_8_2d_h( \
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \
+ const int16x8_t s3, const int16x8_t s4, const int16x4_t x_filter, \
+ const int32x4_t round_vec, const uint16x8_t im_max_val) { \
+ /* Wiener filter is symmetric so add mirrored source elements. */ \
+ int16x8_t s04 = vaddq_s16(s0, s4); \
+ int16x8_t s13 = vaddq_s16(s1, s3); \
+ \
+ /* x_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) */ \
+ int32x4_t sum_lo = \
+ vmlal_lane_s16(round_vec, vget_low_s16(s04), x_filter, 1); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), x_filter, 2); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), x_filter, 3); \
+ \
+ int32x4_t sum_hi = \
+ vmlal_lane_s16(round_vec, vget_high_s16(s04), x_filter, 1); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), x_filter, 2); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), x_filter, 3); \
+ \
+ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \
+ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \
+ \
+ return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \
+ } \
+ \
+ static INLINE void name##_convolve_add_src_5tap_horiz( \
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \
+ const int32x4_t round_vec, const uint16x8_t im_max_val) { \
+ do { \
+ const int16_t *s = (int16_t *)src_ptr; \
+ uint16_t *d = dst_ptr; \
+ int width = w; \
+ \
+ do { \
+ int16x8_t s0, s1, s2, s3, s4; \
+ load_s16_8x5(s, 1, &s0, &s1, &s2, &s3, &s4); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve5_8_2d_h( \
+ s0, s1, s2, s3, s4, x_filter, round_vec, im_max_val); \
+ \
+ vst1q_u16(d, d0); \
+ \
+ s += 8; \
+ d += 8; \
+ width -= 8; \
+ } while (width != 0); \
+ src_ptr += src_stride; \
+ dst_ptr += dst_stride; \
+ } while (--h != 0); \
+ }
+
+HBD_WIENER_5TAP_HORIZ(highbd, WIENER_ROUND0_BITS)
+HBD_WIENER_5TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2)
+
+#undef HBD_WIENER_5TAP_HORIZ
+
+#define HBD_WIENER_7TAP_HORIZ(name, shift) \
+ static INLINE uint16x8_t name##_wiener_convolve7_8_2d_h( \
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \
+ const int16x8_t s6, const int16x4_t x_filter, const int32x4_t round_vec, \
+ const uint16x8_t im_max_val) { \
+ /* Wiener filter is symmetric so add mirrored source elements. */ \
+ int16x8_t s06 = vaddq_s16(s0, s6); \
+ int16x8_t s15 = vaddq_s16(s1, s5); \
+ int16x8_t s24 = vaddq_s16(s2, s4); \
+ \
+ int32x4_t sum_lo = \
+ vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2); \
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3); \
+ \
+ int32x4_t sum_hi = \
+ vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2); \
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3); \
+ \
+ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \
+ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \
+ \
+ return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \
+ } \
+ \
+ static INLINE void name##_convolve_add_src_7tap_horiz( \
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \
+ const int32x4_t round_vec, const uint16x8_t im_max_val) { \
+ do { \
+ const int16_t *s = (int16_t *)src_ptr; \
+ uint16_t *d = dst_ptr; \
+ int width = w; \
+ \
+ do { \
+ int16x8_t s0, s1, s2, s3, s4, s5, s6; \
+ load_s16_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve7_8_2d_h( \
+ s0, s1, s2, s3, s4, s5, s6, x_filter, round_vec, im_max_val); \
+ \
+ vst1q_u16(d, d0); \
+ \
+ s += 8; \
+ d += 8; \
+ width -= 8; \
+ } while (width != 0); \
+ src_ptr += src_stride; \
+ dst_ptr += dst_stride; \
+ } while (--h != 0); \
+ }
+
+HBD_WIENER_7TAP_HORIZ(highbd, WIENER_ROUND0_BITS)
+HBD_WIENER_7TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2)
+
+#undef HBD_WIENER_7TAP_HORIZ
+
+#define HBD_WIENER_5TAP_VERT(name, shift) \
+ static INLINE uint16x8_t name##_wiener_convolve5_8_2d_v( \
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \
+ const int16x8_t s3, const int16x8_t s4, const int16x4_t y_filter, \
+ const int32x4_t round_vec, const uint16x8_t res_max_val) { \
+ const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter)); \
+ const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter)); \
+ /* Wiener filter is symmetric so add mirrored source elements. */ \
+ int32x4_t s04_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s4)); \
+ int32x4_t s13_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s3)); \
+ \
+ /* y_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) */ \
+ int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s04_lo, y_filter_lo, 1); \
+ sum_lo = vmlaq_lane_s32(sum_lo, s13_lo, y_filter_hi, 0); \
+ sum_lo = \
+ vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s2)), y_filter_hi, 1); \
+ \
+ int32x4_t s04_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s4)); \
+ int32x4_t s13_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s3)); \
+ \
+ int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s04_hi, y_filter_lo, 1); \
+ sum_hi = vmlaq_lane_s32(sum_hi, s13_hi, y_filter_hi, 0); \
+ sum_hi = \
+ vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s2)), y_filter_hi, 1); \
+ \
+ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \
+ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \
+ \
+ return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val); \
+ } \
+ \
+ static INLINE void name##_convolve_add_src_5tap_vert( \
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, \
+ const int32x4_t round_vec, const uint16x8_t res_max_val) { \
+ do { \
+ const int16_t *s = (int16_t *)src_ptr; \
+ uint16_t *d = dst_ptr; \
+ int height = h; \
+ \
+ while (height > 3) { \
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; \
+ load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve5_8_2d_v( \
+ s0, s1, s2, s3, s4, y_filter, round_vec, res_max_val); \
+ uint16x8_t d1 = name##_wiener_convolve5_8_2d_v( \
+ s1, s2, s3, s4, s5, y_filter, round_vec, res_max_val); \
+ uint16x8_t d2 = name##_wiener_convolve5_8_2d_v( \
+ s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \
+ uint16x8_t d3 = name##_wiener_convolve5_8_2d_v( \
+ s3, s4, s5, s6, s7, y_filter, round_vec, res_max_val); \
+ \
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3); \
+ \
+ s += 4 * src_stride; \
+ d += 4 * dst_stride; \
+ height -= 4; \
+ } \
+ \
+ while (height-- != 0) { \
+ int16x8_t s0, s1, s2, s3, s4; \
+ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve5_8_2d_v( \
+ s0, s1, s2, s3, s4, y_filter, round_vec, res_max_val); \
+ \
+ vst1q_u16(d, d0); \
+ \
+ s += src_stride; \
+ d += dst_stride; \
+ } \
+ \
+ src_ptr += 8; \
+ dst_ptr += 8; \
+ w -= 8; \
+ } while (w != 0); \
+ }
+
+HBD_WIENER_5TAP_VERT(highbd, 2 * FILTER_BITS - WIENER_ROUND0_BITS)
+HBD_WIENER_5TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2)
+
+#undef HBD_WIENER_5TAP_VERT
+
+#define HBD_WIENER_7TAP_VERT(name, shift) \
+ static INLINE uint16x8_t name##_wiener_convolve7_8_2d_v( \
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \
+ const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec, \
+ const uint16x8_t res_max_val) { \
+ const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter)); \
+ const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter)); \
+ /* Wiener filter is symmetric so add mirrored source elements. */ \
+ int32x4_t s06_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s6)); \
+ int32x4_t s15_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s5)); \
+ int32x4_t s24_lo = vaddl_s16(vget_low_s16(s2), vget_low_s16(s4)); \
+ \
+ int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s06_lo, y_filter_lo, 0); \
+ sum_lo = vmlaq_lane_s32(sum_lo, s15_lo, y_filter_lo, 1); \
+ sum_lo = vmlaq_lane_s32(sum_lo, s24_lo, y_filter_hi, 0); \
+ sum_lo = \
+ vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s3)), y_filter_hi, 1); \
+ \
+ int32x4_t s06_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s6)); \
+ int32x4_t s15_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s5)); \
+ int32x4_t s24_hi = vaddl_s16(vget_high_s16(s2), vget_high_s16(s4)); \
+ \
+ int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s06_hi, y_filter_lo, 0); \
+ sum_hi = vmlaq_lane_s32(sum_hi, s15_hi, y_filter_lo, 1); \
+ sum_hi = vmlaq_lane_s32(sum_hi, s24_hi, y_filter_hi, 0); \
+ sum_hi = \
+ vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s3)), y_filter_hi, 1); \
+ \
+ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \
+ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \
+ \
+ return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val); \
+ } \
+ \
+ static INLINE void name##_convolve_add_src_7tap_vert( \
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, \
+ const int32x4_t round_vec, const uint16x8_t res_max_val) { \
+ do { \
+ const int16_t *s = (int16_t *)src_ptr; \
+ uint16_t *d = dst_ptr; \
+ int height = h; \
+ \
+ while (height > 3) { \
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9; \
+ load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, \
+ &s8, &s9); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve7_8_2d_v( \
+ s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \
+ uint16x8_t d1 = name##_wiener_convolve7_8_2d_v( \
+ s1, s2, s3, s4, s5, s6, s7, y_filter, round_vec, res_max_val); \
+ uint16x8_t d2 = name##_wiener_convolve7_8_2d_v( \
+ s2, s3, s4, s5, s6, s7, s8, y_filter, round_vec, res_max_val); \
+ uint16x8_t d3 = name##_wiener_convolve7_8_2d_v( \
+ s3, s4, s5, s6, s7, s8, s9, y_filter, round_vec, res_max_val); \
+ \
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3); \
+ \
+ s += 4 * src_stride; \
+ d += 4 * dst_stride; \
+ height -= 4; \
+ } \
+ \
+ while (height-- != 0) { \
+ int16x8_t s0, s1, s2, s3, s4, s5, s6; \
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); \
+ \
+ uint16x8_t d0 = name##_wiener_convolve7_8_2d_v( \
+ s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \
+ \
+ vst1q_u16(d, d0); \
+ \
+ s += src_stride; \
+ d += dst_stride; \
+ } \
+ \
+ src_ptr += 8; \
+ dst_ptr += 8; \
+ w -= 8; \
+ } while (w != 0); \
+ }
+
+HBD_WIENER_7TAP_VERT(highbd, 2 * FILTER_BITS - WIENER_ROUND0_BITS)
+HBD_WIENER_7TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2)
+
+#undef HBD_WIENER_7TAP_VERT
+
+static AOM_INLINE int get_wiener_filter_taps(const int16_t *filter) {
+ assert(filter[7] == 0);
+ if (filter[0] == 0 && filter[6] == 0) {
+ return WIENER_WIN_REDUCED;
+ }
+ return WIENER_WIN;
+}
+
+void av1_highbd_wiener_convolve_add_src_neon(
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride, const int16_t *x_filter, int x_step_q4,
+ const int16_t *y_filter, int y_step_q4, int w, int h,
+ const WienerConvolveParams *conv_params, int bd) {
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ assert(w % 8 == 0);
+ assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(x_filter[7] == 0 && y_filter[7] == 0);
+
+ DECLARE_ALIGNED(16, uint16_t,
+ im_block[(MAX_SB_SIZE + WIENER_WIN - 1) * MAX_SB_SIZE]);
+
+ const int x_filter_taps = get_wiener_filter_taps(x_filter);
+ const int y_filter_taps = get_wiener_filter_taps(y_filter);
+ int16x4_t x_filter_s16 = vld1_s16(x_filter);
+ int16x4_t y_filter_s16 = vld1_s16(y_filter);
+ // Add 128 to tap 3. (Needed for rounding.)
+ x_filter_s16 = vadd_s16(x_filter_s16, vcreate_s16(128ULL << 48));
+ y_filter_s16 = vadd_s16(y_filter_s16, vcreate_s16(128ULL << 48));
+
+ const int im_stride = MAX_SB_SIZE;
+ const int im_h = h + y_filter_taps - 1;
+ const int horiz_offset = x_filter_taps / 2;
+ const int vert_offset = (y_filter_taps / 2) * (int)src_stride;
+
+ const int extraprec_clamp_limit =
+ WIENER_CLAMP_LIMIT(conv_params->round_0, bd);
+ const uint16x8_t im_max_val = vdupq_n_u16(extraprec_clamp_limit - 1);
+ const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1));
+
+ const uint16x8_t res_max_val = vdupq_n_u16((1 << bd) - 1);
+ const int32x4_t vert_round_vec =
+ vdupq_n_s32(-(1 << (bd + conv_params->round_1 - 1)));
+
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ if (bd == 12) {
+ if (x_filter_taps == WIENER_WIN_REDUCED) {
+ highbd_12_convolve_add_src_5tap_horiz(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ } else {
+ highbd_12_convolve_add_src_7tap_horiz(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ }
+
+ if (y_filter_taps == WIENER_WIN_REDUCED) {
+ highbd_12_convolve_add_src_5tap_vert(im_block, im_stride, dst, dst_stride,
+ w, h, y_filter_s16, vert_round_vec,
+ res_max_val);
+ } else {
+ highbd_12_convolve_add_src_7tap_vert(im_block, im_stride, dst, dst_stride,
+ w, h, y_filter_s16, vert_round_vec,
+ res_max_val);
+ }
+
+ } else {
+ if (x_filter_taps == WIENER_WIN_REDUCED) {
+ highbd_convolve_add_src_5tap_horiz(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ } else {
+ highbd_convolve_add_src_7tap_horiz(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, x_filter_s16, horiz_round_vec, im_max_val);
+ }
+
+ if (y_filter_taps == WIENER_WIN_REDUCED) {
+ highbd_convolve_add_src_5tap_vert(im_block, im_stride, dst, dst_stride, w,
+ h, y_filter_s16, vert_round_vec,
+ res_max_val);
+ } else {
+ highbd_convolve_add_src_7tap_vert(im_block, im_stride, dst, dst_stride, w,
+ h, y_filter_s16, vert_round_vec,
+ res_max_val);
+ }
+ }
+}