author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-15 03:34:50 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-15 03:34:50 +0000
commit     def92d1b8e9d373e2f6f27c366d578d97d8960c6 (patch)
tree       2ef34b9ad8bb9a9220e05d60352558b15f513894 /third_party/aom/av1/encoder/arm
parent     Adding debian version 125.0.3-1. (diff)
Merging upstream version 126.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/aom/av1/encoder/arm')
 third_party/aom/av1/encoder/arm/neon/av1_error_sve.c                |  2 +-
 third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c | 58 ++++++---
 third_party/aom/av1/encoder/arm/neon/wedge_utils_sve.c (new)        | 92 ++++++++++
 3 files changed, 129 insertions, 23 deletions
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c b/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c
index 63aad0b785..52803a9838 100644
--- a/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c
+++ b/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c
@@ -14,7 +14,7 @@
 #include "config/aom_config.h"
 
 #include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
 #include "aom_dsp/arm/mem_neon.h"
 
 int64_t av1_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
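Note: the av1_error_sve.c change is only the include rename that tracks the upstream move of the SVE helpers from dot_sve.h into aom_neon_sve_bridge.h. That header wraps the ACLE NEON-SVE bridge intrinsics so that 128-bit NEON kernels can borrow individual SVE instructions. A minimal sketch of such a helper, modeled on the aom_sdotq_s16() call used further down in this patch (the helper name is illustrative and the exact contents of the libaom header may differ; compile with SVE enabled, e.g. -march=armv8.2-a+sve):

#include <arm_neon.h>
#include <arm_neon_sve_bridge.h>  // ACLE NEON-SVE bridge intrinsics

// Accumulate a signed 16-bit dot product into 64-bit lanes using the SVE
// SDOT instruction while keeping NEON types at the interface.
// svset_neonq_* places a NEON vector in the low 128 bits of an SVE vector;
// svget_neonq_* extracts it again after the SVE operation.
static inline int64x2_t sdotq_s16_sketch(int64x2_t acc, int16x8_t x,
                                         int16x8_t y) {
  return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc),
                                   svset_neonq_s16(svundef_s16(), x),
                                   svset_neonq_s16(svundef_s16(), y)));
}

Keeping NEON types at the interface lets the surrounding kernels stay plain NEON while still using SDOT's ability to accumulate 16-bit products directly into 64-bit lanes, avoiding intermediate widening steps.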
diff --git a/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
index 5a52e701a2..919521fec7 100644
--- a/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
+++ b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
@@ -23,7 +23,15 @@
 #define SSE_STRIDE (BW + 4)
 
 // clang-format off
+// Table used to pad the first and last columns and apply the sliding window.
+DECLARE_ALIGNED(16, static const uint8_t, kLoadPad[4][16]) = {
+  { 2, 2, 2, 3, 4, 255, 255, 255, 255, 2, 2, 3, 4, 5, 255, 255 },
+  { 255, 255, 2, 3, 4, 5, 6, 255, 255, 255, 255, 3, 4, 5, 6, 7 },
+  { 0, 1, 2, 3, 4, 255, 255, 255, 255, 1, 2, 3, 4, 5, 255, 255 },
+  { 255, 255, 2, 3, 4, 5, 5, 255, 255, 255, 255, 3, 4, 5, 5, 5 }
+};
+// For columns that don't need to be padded it's just a simple mask.
 DECLARE_ALIGNED(16, static const uint8_t, kSlidingWindowMask[]) = {
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00,
   0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00,
@@ -56,22 +64,6 @@ static INLINE void get_abs_diff(const uint8_t *frame1, const uint32_t stride1,
   } while (++i < block_height);
 }
 
-static INLINE uint8x16_t load_and_pad(const uint8_t *src, const uint32_t col,
-                                      const uint32_t block_width) {
-  uint8x8_t s = vld1_u8(src);
-
-  if (col == 0) {
-    const uint8_t lane2 = vget_lane_u8(s, 2);
-    s = vset_lane_u8(lane2, s, 0);
-    s = vset_lane_u8(lane2, s, 1);
-  } else if (col >= block_width - 4) {
-    const uint8_t lane5 = vget_lane_u8(s, 5);
-    s = vset_lane_u8(lane5, s, 6);
-    s = vset_lane_u8(lane5, s, 7);
-  }
-  return vcombine_u8(s, s);
-}
-
 static void apply_temporal_filter(
     const uint8_t *frame, const unsigned int stride, const uint32_t block_width,
     const uint32_t block_height, const int *subblock_mses,
@@ -84,6 +76,10 @@ static void apply_temporal_filter(
   uint32_t acc_5x5_neon[BH][BW];
 
   const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask);
+  const uint8x16_t pad_tbl0 = vld1q_u8(kLoadPad[0]);
+  const uint8x16_t pad_tbl1 = vld1q_u8(kLoadPad[1]);
+  const uint8x16_t pad_tbl2 = vld1q_u8(kLoadPad[2]);
+  const uint8x16_t pad_tbl3 = vld1q_u8(kLoadPad[3]);
 
   // Traverse 4 columns at a time - first and last two columns need padding.
   for (uint32_t col = 0; col < block_width; col += 4) {
@@ -92,9 +88,18 @@ static void apply_temporal_filter(
     // Load, pad (for first and last two columns) and mask 3 rows from the top.
     for (int i = 2; i < 5; i++) {
-      const uint8x16_t s = load_and_pad(src, col, block_width);
-      vsrc[i][0] = vandq_u8(s, vmask.val[0]);
-      vsrc[i][1] = vandq_u8(s, vmask.val[1]);
+      uint8x8_t s = vld1_u8(src);
+      uint8x16_t s_dup = vcombine_u8(s, s);
+      if (col == 0) {
+        vsrc[i][0] = vqtbl1q_u8(s_dup, pad_tbl0);
+        vsrc[i][1] = vqtbl1q_u8(s_dup, pad_tbl1);
+      } else if (col >= block_width - 4) {
+        vsrc[i][0] = vqtbl1q_u8(s_dup, pad_tbl2);
+        vsrc[i][1] = vqtbl1q_u8(s_dup, pad_tbl3);
+      } else {
+        vsrc[i][0] = vandq_u8(s_dup, vmask.val[0]);
+        vsrc[i][1] = vandq_u8(s_dup, vmask.val[1]);
+      }
       src += SSE_STRIDE;
     }
@@ -130,9 +135,18 @@ static void apply_temporal_filter(
     if (row <= block_height - 4) {
       // Load next row into the bottom of the sliding window.
-      uint8x16_t s = load_and_pad(src, col, block_width);
-      vsrc[4][0] = vandq_u8(s, vmask.val[0]);
-      vsrc[4][1] = vandq_u8(s, vmask.val[1]);
+      uint8x8_t s = vld1_u8(src);
+      uint8x16_t s_dup = vcombine_u8(s, s);
+      if (col == 0) {
+        vsrc[4][0] = vqtbl1q_u8(s_dup, pad_tbl0);
+        vsrc[4][1] = vqtbl1q_u8(s_dup, pad_tbl1);
+      } else if (col >= block_width - 4) {
+        vsrc[4][0] = vqtbl1q_u8(s_dup, pad_tbl2);
+        vsrc[4][1] = vqtbl1q_u8(s_dup, pad_tbl3);
+      } else {
+        vsrc[4][0] = vandq_u8(s_dup, vmask.val[0]);
+        vsrc[4][1] = vandq_u8(s_dup, vmask.val[1]);
+      }
       src += SSE_STRIDE;
     } else {
       // Pad the bottom 2 rows.
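Note: the rewrite above replaces load_and_pad(), which patched edge lanes one at a time with vget_lane_u8()/vset_lane_u8(), with a single vqtbl1q_u8() table lookup per output vector. The trick is that TBL writes zero for any out-of-range index, so the 255 entries in kLoadPad both duplicate the edge columns and perform the masking that previously needed a separate vandq_u8(). A self-contained demonstration of that behaviour using kLoadPad[0] from the diff (the sample values and main() harness are invented for illustration):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t row[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
  // kLoadPad[0]: lanes 0-1 are padded with the value at index 2; any index
  // >= 16 is out of range for a 16-byte table, so TBL writes zero there.
  const uint8_t pad_idx[16] = { 2,   2, 2, 3, 4, 255, 255, 255,
                                255, 2, 2, 3, 4, 5,   255, 255 };

  uint8x8_t s = vld1_u8(row);
  uint8x16_t s_dup = vcombine_u8(s, s);  // TBL source must be 16 bytes
  uint8x16_t padded = vqtbl1q_u8(s_dup, vld1q_u8(pad_idx));

  uint8_t out[16];
  vst1q_u8(out, padded);
  // Prints: 12 12 12 13 14 0 0 0 0 12 12 13 14 15 0 0
  for (int i = 0; i < 16; i++) printf("%u ", (unsigned)out[i]);
  printf("\n");
  return 0;
}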
diff --git a/third_party/aom/av1/encoder/arm/neon/wedge_utils_sve.c b/third_party/aom/av1/encoder/arm/neon/wedge_utils_sve.c
new file mode 100644
index 0000000000..521601a3f3
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/wedge_utils_sve.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/reconinter.h"
+
+uint64_t av1_wedge_sse_from_residuals_sve(const int16_t *r1, const int16_t *d,
+                                          const uint8_t *m, int N) {
+  assert(N % 64 == 0);
+
+  // Predicate pattern with first 8 elements true.
+  const svbool_t pattern = svptrue_pat_b16(SV_VL8);
+  int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+  int i = 0;
+  do {
+    int32x4_t sum[4];
+    int16x8_t sum_s16[2];
+
+    const int16x8_t r1_l = vld1q_s16(r1 + i);
+    const int16x8_t r1_h = vld1q_s16(r1 + i + 8);
+    const int16x8_t d_l = vld1q_s16(d + i);
+    const int16x8_t d_h = vld1q_s16(d + i + 8);
+
+    // Use a zero-extending load to widen the vector elements.
+    const int16x8_t m_l = svget_neonq_s16(svld1ub_s16(pattern, m + i));
+    const int16x8_t m_h = svget_neonq_s16(svld1ub_s16(pattern, m + i + 8));
+
+    sum[0] = vshll_n_s16(vget_low_s16(r1_l), WEDGE_WEIGHT_BITS);
+    sum[1] = vshll_n_s16(vget_high_s16(r1_l), WEDGE_WEIGHT_BITS);
+    sum[2] = vshll_n_s16(vget_low_s16(r1_h), WEDGE_WEIGHT_BITS);
+    sum[3] = vshll_n_s16(vget_high_s16(r1_h), WEDGE_WEIGHT_BITS);
+
+    sum[0] = vmlal_s16(sum[0], vget_low_s16(m_l), vget_low_s16(d_l));
+    sum[1] = vmlal_s16(sum[1], vget_high_s16(m_l), vget_high_s16(d_l));
+    sum[2] = vmlal_s16(sum[2], vget_low_s16(m_h), vget_low_s16(d_h));
+    sum[3] = vmlal_s16(sum[3], vget_high_s16(m_h), vget_high_s16(d_h));
+
+    sum_s16[0] = vcombine_s16(vqmovn_s32(sum[0]), vqmovn_s32(sum[1]));
+    sum_s16[1] = vcombine_s16(vqmovn_s32(sum[2]), vqmovn_s32(sum[3]));
+
+    sse[0] = aom_sdotq_s16(sse[0], sum_s16[0], sum_s16[0]);
+    sse[1] = aom_sdotq_s16(sse[1], sum_s16[1], sum_s16[1]);
+
+    i += 16;
+  } while (i < N);
+
+  const uint64_t csse =
+      (uint64_t)horizontal_add_s64x2(vaddq_s64(sse[0], sse[1]));
+  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+int8_t av1_wedge_sign_from_residuals_sve(const int16_t *ds, const uint8_t *m,
+                                         int N, int64_t limit) {
+  assert(N % 16 == 0);
+
+  // Predicate pattern with first 8 elements true.
+  svbool_t pattern = svptrue_pat_b16(SV_VL8);
+  int64x2_t acc_l = vdupq_n_s64(0);
+  int64x2_t acc_h = vdupq_n_s64(0);
+
+  do {
+    const int16x8_t ds_l = vld1q_s16(ds);
+    const int16x8_t ds_h = vld1q_s16(ds + 8);
+
+    // Use a zero-extending load to widen the vector elements.
+    const int16x8_t m_l = svget_neonq_s16(svld1ub_s16(pattern, m));
+    const int16x8_t m_h = svget_neonq_s16(svld1ub_s16(pattern, m + 8));
+
+    acc_l = aom_sdotq_s16(acc_l, ds_l, m_l);
+    acc_h = aom_sdotq_s16(acc_h, ds_h, m_h);
+
+    ds += 16;
+    m += 16;
+    N -= 16;
+  } while (N != 0);
+
+  const int64x2_t sum = vaddq_s64(acc_l, acc_h);
+  return horizontal_add_s64x2(sum) > limit;
+}
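Note: stripped of the SIMD, the two new kernels compute simple sums. av1_wedge_sign_from_residuals is just sum(ds[i] * m[i]) > limit, and av1_wedge_sse_from_residuals accumulates the squared, int16-saturated values of (r1[i] << WEDGE_WEIGHT_BITS) + m[i] * d[i] before a rounding shift. A scalar model of the latter, assuming WEDGE_WEIGHT_BITS == 6 as defined in libaom's reconinter.h (the function name and macro restatement are for illustration only, not the library's C reference implementation):

#include <stdint.h>

#define WEDGE_WEIGHT_BITS 6  // restated here so the sketch is self-contained

static uint64_t wedge_sse_scalar(const int16_t *r1, const int16_t *d,
                                 const uint8_t *m, int N) {
  uint64_t csse = 0;
  for (int i = 0; i < N; i++) {
    // (r1 << WEDGE_WEIGHT_BITS) + m * d, saturated to int16_t: this mirrors
    // the vshll_n_s16 / vmlal_s16 / vqmovn_s32 sequence in the diff above.
    int32_t t = ((int32_t)r1[i] << WEDGE_WEIGHT_BITS) + m[i] * d[i];
    if (t > INT16_MAX) t = INT16_MAX;
    if (t < INT16_MIN) t = INT16_MIN;
    csse += (uint64_t)((int64_t)t * t);
  }
  // Rounding shift by 2 * WEDGE_WEIGHT_BITS, as ROUND_POWER_OF_TWO does.
  return (csse + (1ULL << (2 * WEDGE_WEIGHT_BITS - 1))) >>
         (2 * WEDGE_WEIGHT_BITS);
}

The saturation step is why the vector code can square the narrowed 16-bit sums with aom_sdotq_s16() and accumulate directly into 64-bit lanes without risking overflow.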