summaryrefslogtreecommitdiffstats
path: root/third_party/aom/av1
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-06-12 05:43:14 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-06-12 05:43:14 +0000
commit8dd16259287f58f9273002717ec4d27e97127719 (patch)
tree3863e62a53829a84037444beab3abd4ed9dfc7d0 /third_party/aom/av1
parentReleasing progress-linux version 126.0.1-1~progress7.99u1. (diff)
downloadfirefox-8dd16259287f58f9273002717ec4d27e97127719.tar.xz
firefox-8dd16259287f58f9273002717ec4d27e97127719.zip
Merging upstream version 127.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/aom/av1')
-rw-r--r--third_party/aom/av1/av1.cmake2
-rw-r--r--third_party/aom/av1/av1_cx_iface.c1
-rw-r--r--third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c55
-rw-r--r--third_party/aom/av1/common/arm/convolve_neon_dotprod.c49
-rw-r--r--third_party/aom/av1/common/av1_rtcd_defs.pl7
-rw-r--r--third_party/aom/av1/common/resize.c58
-rw-r--r--third_party/aom/av1/common/resize.h10
-rw-r--r--third_party/aom/av1/common/x86/resize_avx2.c411
-rw-r--r--third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c5
-rw-r--r--third_party/aom/av1/encoder/arm/neon/pickrst_sve.c590
-rw-r--r--third_party/aom/av1/encoder/enc_enums.h4
-rw-r--r--third_party/aom/av1/encoder/encodeframe.c4
-rw-r--r--third_party/aom/av1/encoder/encoder.h2
-rw-r--r--third_party/aom/av1/encoder/ethread.c7
-rw-r--r--third_party/aom/av1/encoder/global_motion.h7
-rw-r--r--third_party/aom/av1/encoder/nonrd_pickmode.c34
-rw-r--r--third_party/aom/av1/encoder/partition_search.c20
-rw-r--r--third_party/aom/av1/encoder/picklpf.c2
-rw-r--r--third_party/aom/av1/encoder/pickrst.c21
-rw-r--r--third_party/aom/av1/encoder/speed_features.c2
-rw-r--r--third_party/aom/av1/encoder/tune_vmaf.c4
-rw-r--r--third_party/aom/av1/encoder/x86/pickrst_avx2.c12
-rw-r--r--third_party/aom/av1/encoder/x86/pickrst_sse4.c18
23 files changed, 1210 insertions, 115 deletions
diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake
index 32645f6065..b6cf974aa7 100644
--- a/third_party/aom/av1/av1.cmake
+++ b/third_party/aom/av1/av1.cmake
@@ -302,6 +302,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_AVX2
"${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c"
"${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c"
"${AOM_ROOT}/av1/common/x86/reconinter_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/resize_avx2.c"
"${AOM_ROOT}/av1/common/x86/selfguided_avx2.c"
"${AOM_ROOT}/av1/common/x86/warp_plane_avx2.c"
"${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c")
@@ -375,6 +376,7 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD
list(APPEND AOM_AV1_ENCODER_INTRIN_SVE
"${AOM_ROOT}/av1/encoder/arm/neon/av1_error_sve.c"
+ "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_sve.c"
"${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_sve.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32
diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c
index 2b6b1504e6..39c03c9ecb 100644
--- a/third_party/aom/av1/av1_cx_iface.c
+++ b/third_party/aom/av1/av1_cx_iface.c
@@ -32,6 +32,7 @@
#include "av1/common/enums.h"
#include "av1/common/scale.h"
#include "av1/encoder/bitstream.h"
+#include "av1/encoder/enc_enums.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/encoder_alloc.h"
#include "av1/encoder/encoder_utils.h"
diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c b/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c
index 3aeffbb0e6..40befdf44e 100644
--- a/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c
+++ b/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c
@@ -80,17 +80,15 @@ static INLINE void dist_wtd_convolve_2d_horiz_neon_dotprod(
const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
const int16_t *x_filter_ptr, const int im_h, int w) {
const int bd = 8;
- const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2));
// Dot product constants and other shims.
const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
- const int32_t correction_s32 =
- vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
- // Fold horiz_const into the dot-product filter correction constant. The
- // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
- // rounding shifts - which are generally faster than rounding shifts on
- // modern CPUs. (The extra -1 is needed because we halved the filter values.)
- const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const +
- (1 << ((ROUND0_BITS - 1) - 1)));
+ // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
+ // - which are generally faster than rounding shifts on modern CPUs.
+ const int32_t horiz_const =
+ ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
+ // Halve the total because we will halve the filter values.
+ const int32x4_t correction =
+ vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);
const uint8x16_t range_limit = vdupq_n_u8(128);
const uint8_t *src_ptr = src;
@@ -334,15 +332,14 @@ static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod(
// Dot-product constants and other shims.
const uint8x16_t range_limit = vdupq_n_u8(128);
- const int32_t correction_s32 =
- vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
// Fold round_offset into the dot-product filter correction constant. The
- // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
- // rounding shifts - which are generally faster than rounding shifts on
- // modern CPUs. (The extra -1 is needed because we halved the filter values.)
+ // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // Halve the total because we will halve the filter values.
int32x4_t correction =
- vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
- (1 << ((ROUND0_BITS - 1) - 1)));
+ vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) +
+ (1 << (ROUND0_BITS - 1))) /
+ 2);
const int horiz_offset = filter_params_x->taps / 2 - 1;
const uint8_t *src_ptr = src - horiz_offset;
@@ -455,15 +452,14 @@ static INLINE void dist_wtd_convolve_x_avg_neon_dotprod(
// Dot-product constants and other shims.
const uint8x16_t range_limit = vdupq_n_u8(128);
- const int32_t correction_s32 =
- vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
// Fold round_offset into the dot-product filter correction constant. The
- // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
- // rounding shifts - which are generally faster than rounding shifts on
- // modern CPUs. (The extra -1 is needed because we halved the filter values.)
+ // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // Halve the total because we will halve the filter values.
int32x4_t correction =
- vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
- (1 << ((ROUND0_BITS - 1) - 1)));
+ vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) +
+ (1 << (ROUND0_BITS - 1))) /
+ 2);
const int horiz_offset = filter_params_x->taps / 2 - 1;
const uint8_t *src_ptr = src - horiz_offset;
@@ -574,15 +570,14 @@ static INLINE void dist_wtd_convolve_x_neon_dotprod(
// Dot-product constants and other shims.
const uint8x16_t range_limit = vdupq_n_u8(128);
- const int32_t correction_s32 =
- vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
// Fold round_offset into the dot-product filter correction constant. The
- // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-
- // rounding shifts - which are generally faster than rounding shifts on
- // modern CPUs. (The extra -1 is needed because we halved the filter values.)
+ // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // Halve the total because we will halve the filter values.
int32x4_t correction =
- vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) +
- (1 << ((ROUND0_BITS - 1) - 1)));
+ vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) +
+ (1 << (ROUND0_BITS - 1))) /
+ 2);
const int horiz_offset = filter_params_x->taps / 2 - 1;
const uint8_t *src_ptr = src - horiz_offset;
diff --git a/third_party/aom/av1/common/arm/convolve_neon_dotprod.c b/third_party/aom/av1/common/arm/convolve_neon_dotprod.c
index c29229eb09..132da2442b 100644
--- a/third_party/aom/av1/common/arm/convolve_neon_dotprod.c
+++ b/third_party/aom/av1/common/arm/convolve_neon_dotprod.c
@@ -102,14 +102,12 @@ static INLINE void convolve_x_sr_12tap_neon_dotprod(
const int8x16_t filter =
vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15));
- const int32_t correction_s32 =
- vaddvq_s32(vaddq_s32(vpaddlq_s16(vshlq_n_s16(filter_0_7, FILTER_BITS)),
- vpaddlq_s16(vshlq_n_s16(filter_8_15, FILTER_BITS))));
- // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right
- // shift by FILTER_BITS - instead of a first rounding right shift by
+ // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
+ // right shift by FILTER_BITS - instead of a first rounding right shift by
// ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
// ROUND0_BITS.
- int32x4_t correction = vdupq_n_s32(correction_s32 + (1 << (ROUND0_BITS - 1)));
+ int32x4_t correction =
+ vdupq_n_s32((128 << FILTER_BITS) + (1 << (ROUND0_BITS - 1)));
const uint8x16_t range_limit = vdupq_n_u8(128);
const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
@@ -274,16 +272,13 @@ void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride,
}
const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
- // Dot product constants.
- const int32_t correction_s32 =
- vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
- // This shim of (1 << ((ROUND0_BITS - 1) - 1) enables us to use a single
- // rounding right shift by FILTER_BITS - instead of a first rounding right
- // shift by ROUND0_BITS, followed by second rounding right shift by
- // FILTER_BITS - ROUND0_BITS.
- // The outermost -1 is needed because we will halve the filter values.
+ // Dot product constants:
+ // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
+ // right shift by FILTER_BITS - instead of a first rounding right shift by
+ // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
+ // ROUND0_BITS. Halve the total because we will halve the filter values.
const int32x4_t correction =
- vdupq_n_s32(correction_s32 + (1 << ((ROUND0_BITS - 1) - 1)));
+ vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2);
const uint8x16_t range_limit = vdupq_n_u8(128);
if (w <= 4) {
@@ -465,16 +460,13 @@ static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod(
const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
vmovn_s16(x_filter_s16.val[1]));
- // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
- // - which are generally faster than rounding shifts on modern CPUs.
+ // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
const int32_t horiz_const =
((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
// Dot product constants.
- const int32x4_t correct_tmp =
- vaddq_s32(vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[0], 7)),
- vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[1], 7)));
const int32x4_t correction =
- vdupq_n_s32(vaddvq_s32(correct_tmp) + horiz_const);
+ vdupq_n_s32((128 << FILTER_BITS) + horiz_const);
const uint8x16_t range_limit = vdupq_n_u8(128);
const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
@@ -621,16 +613,15 @@ static INLINE void convolve_2d_sr_horiz_neon_dotprod(
const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
int im_h, const int16_t *x_filter_ptr) {
const int bd = 8;
- // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
- // shifts - which are generally faster than rounding shifts on modern CPUs.
- // The outermost -1 is needed because we halved the filter values.
- const int32_t horiz_const =
- ((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1)));
// Dot product constants.
const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
- const int32_t correction_s32 =
- vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1));
- const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const);
+ // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ const int32_t horiz_const =
+ ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
+ // Halve the total because we will halve the filter values.
+ const int32x4_t correction =
+ vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);
const uint8x16_t range_limit = vdupq_n_u8(128);
const uint8_t *src_ptr = src;
diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl
index c0831330d1..6a0043c761 100644
--- a/third_party/aom/av1/common/av1_rtcd_defs.pl
+++ b/third_party/aom/av1/common/av1_rtcd_defs.pl
@@ -458,7 +458,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats";
- specialize qw/av1_compute_stats sse4_1 avx2 neon/;
+ specialize qw/av1_compute_stats sse4_1 avx2 neon sve/;
add_proto qw/void av1_calc_proj_params/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params";
specialize qw/av1_calc_proj_params sse4_1 avx2 neon/;
add_proto qw/int64_t av1_lowbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
@@ -469,7 +469,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/av1_calc_proj_params_high_bd sse4_1 avx2 neon/;
add_proto qw/int64_t av1_highbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2 neon/;
- add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
+ add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth";
specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon/;
}
}
@@ -554,6 +554,9 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/;
}
+add_proto qw/bool resize_vert_dir/, "uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col";
+specialize qw/resize_vert_dir avx2/;
+
add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
specialize qw/av1_warp_affine sse4_1 avx2 neon neon_i8mm sve/;
diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c
index 441323ab1f..2b48b9fff4 100644
--- a/third_party/aom/av1/common/resize.c
+++ b/third_party/aom/av1/common/resize.c
@@ -18,6 +18,7 @@
#include <string.h>
#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/flow_estimation/corner_detect.h"
@@ -216,10 +217,6 @@ const int16_t av1_resize_filter_normative[(
// Filters for interpolation (full-band) - no filtering for integer pixels
#define filteredinterp_filters1000 av1_resize_filter_normative
-// Filters for factor of 2 downsampling.
-static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 };
-static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 };
-
static const InterpKernel *choose_interp_filter(int in_length, int out_length) {
int out_length16 = out_length * 16;
if (out_length16 >= in_length * 16)
@@ -524,6 +521,59 @@ static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {
}
}
+bool resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride,
+ int height, int height2, int width2, int start_col) {
+ bool mem_status = true;
+ uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(*arrbuf) * height);
+ uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(*arrbuf2) * height2);
+ if (arrbuf == NULL || arrbuf2 == NULL) {
+ mem_status = false;
+ goto Error;
+ }
+
+ for (int i = start_col; i < width2; ++i) {
+ fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+ down2_symeven(arrbuf, height, arrbuf2);
+ fill_arr_to_col(output + i, out_stride, height2, arrbuf2);
+ }
+
+Error:
+ aom_free(arrbuf);
+ aom_free(arrbuf2);
+ return mem_status;
+}
+
+void resize_horz_dir(const uint8_t *const input, int in_stride, uint8_t *intbuf,
+ int height, int filtered_length, int width2) {
+ for (int i = 0; i < height; ++i)
+ down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i);
+}
+
+bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2,
+ int width2, int out_stride) {
+ uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(*intbuf) * width2 * height);
+ if (intbuf == NULL) {
+ return false;
+ }
+
+ // Resize in the horizontal direction
+ resize_horz_dir(input, in_stride, intbuf, height, width, width2);
+ // Resize in the vertical direction
+ bool mem_status = resize_vert_dir(intbuf, output, out_stride, height, height2,
+ width2, 0 /*start_col*/);
+ aom_free(intbuf);
+ return mem_status;
+}
+
+// Check if both the output width and height are half of input width and
+// height respectively.
+bool should_resize_by_half(int height, int width, int height2, int width2) {
+ const bool is_width_by_2 = get_down2_length(width, 1) == width2;
+ const bool is_height_by_2 = get_down2_length(height, 1) == height2;
+ return (is_width_by_2 && is_height_by_2);
+}
+
bool av1_resize_plane(const uint8_t *input, int height, int width,
int in_stride, uint8_t *output, int height2, int width2,
int out_stride) {
diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h
index d573a538bf..de71f5d539 100644
--- a/third_party/aom/av1/common/resize.h
+++ b/third_party/aom/av1/common/resize.h
@@ -20,6 +20,10 @@
extern "C" {
#endif
+// Filters for factor of 2 downsampling.
+static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 };
+static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 };
+
bool av1_resize_plane(const uint8_t *input, int height, int width,
int in_stride, uint8_t *output, int height2, int width2,
int out_stride);
@@ -93,6 +97,12 @@ void av1_calculate_unscaled_superres_size(int *width, int *height, int denom);
void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool,
bool alloc_pyramid);
+bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2,
+ int width2, int out_stride);
+
+bool should_resize_by_half(int height, int width, int height2, int width2);
+
// Returns 1 if a superres upscaled frame is scaled and 0 otherwise.
static INLINE int av1_superres_scaled(const AV1_COMMON *cm) {
// Note: for some corner cases (e.g. cm->width of 1), there may be no scaling
diff --git a/third_party/aom/av1/common/x86/resize_avx2.c b/third_party/aom/av1/common/x86/resize_avx2.c
new file mode 100644
index 0000000000..c44edb88d9
--- /dev/null
+++ b/third_party/aom/av1/common/x86/resize_avx2.c
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+#include <string.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/resize.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+#define CAST_HI(x) _mm256_castsi128_si256(x)
+#define CAST_LOW(x) _mm256_castsi256_si128(x)
+
+#define PROCESS_RESIZE_Y_WD16 \
+ const int idx1 = AOMMIN(height - 1, i + 5); \
+ const int idx2 = AOMMIN(height - 1, i + 6); \
+ l6 = l10; \
+ l7 = l11; \
+ l8 = _mm_loadu_si128((__m128i *)(data + idx1 * stride)); \
+ l9 = _mm_loadu_si128((__m128i *)(data + idx2 * stride)); \
+ \
+ /* g0... g15 | i0... i15 */ \
+ const __m256i s68 = \
+ _mm256_permute2x128_si256(CAST_HI(l6), CAST_HI(l8), 0x20); \
+ /* h0... h15 | j0... j15 */ \
+ const __m256i s79 = \
+ _mm256_permute2x128_si256(CAST_HI(l7), CAST_HI(l9), 0x20); \
+ \
+ /* g0h0... g7h7 | i0j0... i7j7 */ \
+ s[3] = _mm256_unpacklo_epi8(s68, s79); \
+ /* g8h8... g15h15 | i8j8... i15j15 */ \
+ s[8] = _mm256_unpackhi_epi8(s68, s79); \
+ \
+ __m256i res_out[2] = { 0 }; \
+ resize_y_convolve(s, coeffs_y, res_out); \
+ \
+ /* r00... r07 */ \
+ __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
+ /* r20... r27 */ \
+ __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \
+ \
+ res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \
+ res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \
+ \
+ __m256i res_out_b[2] = { 0 }; \
+ resize_y_convolve(s + 5, coeffs_y, res_out_b); \
+ \
+ /* r08... r015 */ \
+ __m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \
+ /* r28... r215 */ \
+ __m256i res_b_round_2 = _mm256_add_epi32(res_out_b[1], round_const_bits); \
+ res_b_round_1 = _mm256_sra_epi32(res_b_round_1, round_shift_bits); \
+ res_b_round_2 = _mm256_sra_epi32(res_b_round_2, round_shift_bits); \
+ \
+ /* r00... r03 r20... r23 | r04... r07 r24... r27 */ \
+ __m256i res_8bit0 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \
+ /* r08... r011 r28... r211 | r012... r015 r212... r215 */ \
+ __m256i res_8bit1 = _mm256_packus_epi32(res_b_round_1, res_b_round_2); \
+ /* r00... r07 | r20... r27 */ \
+ res_8bit0 = _mm256_permute4x64_epi64(res_8bit0, 0xd8); \
+ /* r08... r015 | r28... r215 */ \
+ res_8bit1 = _mm256_permute4x64_epi64(res_8bit1, 0xd8); \
+ /* r00... r015 | r20... r215 */ \
+ res_8bit1 = _mm256_packus_epi16(res_8bit0, res_8bit1); \
+ res_8bit0 = _mm256_min_epu8(res_8bit1, clip_pixel); \
+ res_8bit0 = _mm256_max_epu8(res_8bit0, zero);
+
+#define PROCESS_RESIZE_Y_WD8 \
+ const int idx1 = AOMMIN(height - 1, i + 5); \
+ const int idx2 = AOMMIN(height - 1, i + 6); \
+ l6 = l10; \
+ l7 = l11; \
+ l8 = _mm_loadl_epi64((__m128i *)(data + idx1 * stride)); \
+ l9 = _mm_loadl_epi64((__m128i *)(data + idx2 * stride)); \
+ \
+ /* g0h0... g7h7 */ \
+ s67 = _mm_unpacklo_epi8(l6, l7); \
+ /* i0j0...i7j7 */ \
+ __m128i s89 = _mm_unpacklo_epi8(l8, l9); \
+ \
+ /* g0h0...g7h7 | i0j0...i7j7 */ \
+ s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20); \
+ \
+ __m256i res_out[2] = { 0 }; \
+ resize_y_convolve(s, coeffs_y, res_out); \
+ \
+ /* r00... r07 */ \
+ __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
+ /* r20...r27 */ \
+ __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \
+ res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \
+ res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \
+ \
+ /* r00...r03 r20...r23 | r04...r07 r24...r27 */ \
+ res_a_round_1 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \
+ /* r00...r07 | r20...r27 */ \
+ res_a_round_1 = _mm256_permute4x64_epi64(res_a_round_1, 0xd8); \
+ res_a_round_1 = _mm256_packus_epi16(res_a_round_1, res_a_round_1); \
+ res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel); \
+ res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero);
+
+static INLINE void resize_y_convolve(const __m256i *const s,
+ const __m256i *const coeffs,
+ __m256i *res_out) {
+ const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+ const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+ const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+ const __m256i res_3 = _mm256_maddubs_epi16(s[3], coeffs[3]);
+
+ const __m256i dst_0 = _mm256_add_epi16(res_0, res_1);
+ const __m256i dst_1 = _mm256_add_epi16(res_2, res_3);
+ // The sum of convolve operation crosses signed 16bit. Hence, the addition
+ // should happen in 32bit.
+ const __m256i dst_00 = _mm256_cvtepi16_epi32(CAST_LOW(dst_0));
+ const __m256i dst_01 =
+ _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_0, 1));
+ const __m256i dst_10 = _mm256_cvtepi16_epi32(CAST_LOW(dst_1));
+ const __m256i dst_11 =
+ _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_1, 1));
+
+ res_out[0] = _mm256_add_epi32(dst_00, dst_10);
+ res_out[1] = _mm256_add_epi32(dst_01, dst_11);
+}
+
+static INLINE void prepare_filter_coeffs(const int16_t *filter,
+ __m256i *const coeffs /* [4] */) {
+ // f0 f1 f2 f3 x x x x
+ const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter);
+ // f0 f1 f2 f3 f0 f1 f2 f3
+ const __m128i tmp0 = _mm_shuffle_epi32(sym_even_filter, 0x44);
+ // f0 f1 f2 f3 f1 f0 f3 f2
+ const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, 0xb1);
+
+ const __m128i filter_8bit = _mm_packs_epi16(tmp1, tmp1);
+
+ // f0 f1 f0 f1 ..
+ coeffs[2] = _mm256_broadcastw_epi16(filter_8bit);
+ // f2 f3 f2 f3 ..
+ coeffs[3] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 2));
+ // f3 f2 f3 f2 ..
+ coeffs[0] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 6));
+ // f1 f0 f1 f0 ..
+ coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4));
+}
+
+bool resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
+ int height, int height2, int stride, int start_col) {
+ assert(start_col <= stride);
+ // For the GM tool, the input layer height or width is assured to be an even
+ // number. Hence the function 'down2_symodd()' is not invoked and SIMD
+ // optimization of the same is not implemented.
+ // When the input height is less than 8 and even, the potential input
+ // heights are limited to 2, 4, or 6. These scenarios require separate
+ // handling due to padding requirements. Invoking the C function here will
+ // eliminate the need for conditional statements within the subsequent SIMD
+ // code to manage these cases.
+ if (height & 1 || height < 8) {
+ return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
+ stride, start_col);
+ }
+
+ __m256i s[10], coeffs_y[4];
+ const int bits = FILTER_BITS;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const uint8_t max_pixel = 255;
+ const __m256i clip_pixel = _mm256_set1_epi8(max_pixel);
+ const __m256i zero = _mm256_setzero_si256();
+
+ prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);
+
+ const int num_col16 = stride / 16;
+ int remain_col = stride % 16;
+ // The core vertical SIMD processes 4 input rows simultaneously to generate
+ // output corresponding to 2 rows. To streamline the core loop and eliminate
+ // the need for conditional checks, the remaining rows (4 or 6) are processed
+ // separately.
+ const int remain_row = (height % 4 == 0) ? 4 : 6;
+
+ for (int j = start_col; j < stride - remain_col; j += 16) {
+ const uint8_t *data = &intbuf[j];
+ const __m128i l3 = _mm_loadu_si128((__m128i *)(data + 0 * stride));
+ // Padding top 3 rows with the last available row at the top.
+ const __m128i l0 = l3;
+ const __m128i l1 = l3;
+ const __m128i l2 = l3;
+ const __m128i l4 = _mm_loadu_si128((__m128i *)(data + 1 * stride));
+
+ __m128i l6, l7, l8, l9;
+ __m128i l5 = _mm_loadu_si128((__m128i *)(data + 2 * stride));
+ __m128i l10 = _mm_loadu_si128((__m128i *)(data + 3 * stride));
+ __m128i l11 = _mm_loadu_si128((__m128i *)(data + 4 * stride));
+
+ // a0...a15 | c0...c15
+ const __m256i s02 =
+ _mm256_permute2x128_si256(CAST_HI(l0), CAST_HI(l2), 0x20);
+ // b0...b15 | d0...d15
+ const __m256i s13 =
+ _mm256_permute2x128_si256(CAST_HI(l1), CAST_HI(l3), 0x20);
+ // c0...c15 | e0...e15
+ const __m256i s24 =
+ _mm256_permute2x128_si256(CAST_HI(l2), CAST_HI(l4), 0x20);
+ // d0...d15 | f0...f15
+ const __m256i s35 =
+ _mm256_permute2x128_si256(CAST_HI(l3), CAST_HI(l5), 0x20);
+ // e0...e15 | g0...g15
+ const __m256i s46 =
+ _mm256_permute2x128_si256(CAST_HI(l4), CAST_HI(l10), 0x20);
+ // f0...f15 | h0...h15
+ const __m256i s57 =
+ _mm256_permute2x128_si256(CAST_HI(l5), CAST_HI(l11), 0x20);
+
+ // a0b0...a7b7 | c0d0...c7d7
+ s[0] = _mm256_unpacklo_epi8(s02, s13);
+ // c0d0...c7d7 | e0f0...e7f7
+ s[1] = _mm256_unpacklo_epi8(s24, s35);
+ // e0f0...e7f7 | g0h0...g7h7
+ s[2] = _mm256_unpacklo_epi8(s46, s57);
+
+ // a8b8...a15b15 | c8d8...c15d15
+ s[5] = _mm256_unpackhi_epi8(s02, s13);
+ // c8d8...c15d15 | e8f8...e15f15
+ s[6] = _mm256_unpackhi_epi8(s24, s35);
+ // e8f8...e15f15 | g8h8...g15h15
+ s[7] = _mm256_unpackhi_epi8(s46, s57);
+
+ // height to be processed here
+ const int process_ht = height - remain_row;
+ for (int i = 0; i < process_ht; i += 4) {
+ PROCESS_RESIZE_Y_WD16
+
+ _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
+ CAST_LOW(res_8bit0));
+
+ _mm_storeu_si128(
+ (__m128i *)&output[(i / 2) * out_stride + j + out_stride],
+ _mm256_extracti128_si256(res_8bit0, 1));
+
+ // Load the required data for processing of next 4 input rows.
+ const int idx7 = AOMMIN(height - 1, i + 7);
+ const int idx8 = AOMMIN(height - 1, i + 8);
+ l10 = _mm_loadu_si128((__m128i *)(data + idx7 * stride));
+ l11 = _mm_loadu_si128((__m128i *)(data + idx8 * stride));
+
+ const __m256i s810 =
+ _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20);
+ const __m256i s911 =
+ _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20);
+ // i0j0... i7j7 | k0l0... k7l7
+ s[4] = _mm256_unpacklo_epi8(s810, s911);
+ // i8j8... i15j15 | k8l8... k15l15
+ s[9] = _mm256_unpackhi_epi8(s810, s911);
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+
+ s[5] = s[7];
+ s[6] = s[8];
+ s[7] = s[9];
+ }
+
+ // Process the remaining last 4 or 6 rows here.
+ int i = process_ht;
+ while (i < height - 1) {
+ PROCESS_RESIZE_Y_WD16
+
+ _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
+ CAST_LOW(res_8bit0));
+ i += 2;
+
+ const int is_store_valid = (i < height - 1);
+ if (is_store_valid)
+ _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
+ _mm256_extracti128_si256(res_8bit0, 1));
+ i += 2;
+
+ // Check if there is any remaining height to process. If so, perform the
+ // necessary data loading for processing the next row.
+ if (i < height - 1) {
+ l10 = l11 = l9;
+ const __m256i s810 =
+ _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20);
+ const __m256i s911 =
+ _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20);
+ // i0j0... i7j7 | k0l0... k7l7
+ s[4] = _mm256_unpacklo_epi8(s810, s911);
+ // i8j8... i15j15 | k8l8... k15l15
+ s[9] = _mm256_unpackhi_epi8(s810, s911);
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+
+ s[5] = s[7];
+ s[6] = s[8];
+ s[7] = s[9];
+ }
+ }
+ }
+
+ if (remain_col > 7) {
+ const int processed_wd = num_col16 * 16;
+ remain_col = stride % 8;
+
+ const uint8_t *data = &intbuf[processed_wd];
+
+ const __m128i l3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride));
+ // Padding top 3 rows with available top-most row.
+ const __m128i l0 = l3;
+ const __m128i l1 = l3;
+ const __m128i l2 = l3;
+ const __m128i l4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride));
+
+ __m128i l6, l7, l8, l9;
+ __m128i l5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride));
+ __m128i l10 = _mm_loadl_epi64((__m128i *)(data + 3 * stride));
+ __m128i l11 = _mm_loadl_epi64((__m128i *)(data + 4 * stride));
+
+ // a0b0...a7b7
+ const __m128i s01 = _mm_unpacklo_epi8(l0, l1);
+ // c0d0...c7d7
+ const __m128i s23 = _mm_unpacklo_epi8(l2, l3);
+ // e0f0...e7f7
+ const __m128i s45 = _mm_unpacklo_epi8(l4, l5);
+ // g0h0...g7h7
+ __m128i s67 = _mm_unpacklo_epi8(l10, l11);
+
+ // a0b0...a7b7 | c0d0...c7d7
+ s[0] = _mm256_permute2x128_si256(CAST_HI(s01), CAST_HI(s23), 0x20);
+ // c0d0...c7d7 | e0f0...e7f7
+ s[1] = _mm256_permute2x128_si256(CAST_HI(s23), CAST_HI(s45), 0x20);
+ // e0f0...e7f7 | g0h0...g7h7
+ s[2] = _mm256_permute2x128_si256(CAST_HI(s45), CAST_HI(s67), 0x20);
+
+ // height to be processed here
+ const int process_ht = height - remain_row;
+ for (int i = 0; i < process_ht; i += 4) {
+ PROCESS_RESIZE_Y_WD8
+
+ _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd],
+ CAST_LOW(res_a_round_1));
+
+ _mm_storel_epi64(
+ (__m128i *)&output[(i / 2) * out_stride + processed_wd + out_stride],
+ _mm256_extracti128_si256(res_a_round_1, 1));
+
+ const int idx7 = AOMMIN(height - 1, i + 7);
+ const int idx8 = AOMMIN(height - 1, i + 8);
+ l10 = _mm_loadl_epi64((__m128i *)(data + idx7 * stride));
+ l11 = _mm_loadl_epi64((__m128i *)(data + idx8 * stride));
+
+ // k0l0... k7l7
+ const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11);
+ // i0j0... i7j7 | k0l0... k7l7
+ s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20);
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ }
+
+ // Process the remaining last 4 or 6 rows here.
+ int i = process_ht;
+ while (i < height - 1) {
+ PROCESS_RESIZE_Y_WD8
+
+ _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd],
+ CAST_LOW(res_a_round_1));
+
+ i += 2;
+
+ const int is_store_valid = (i < height - 1);
+ if (is_store_valid)
+ _mm_storel_epi64(
+ (__m128i *)&output[(i / 2) * out_stride + processed_wd],
+ _mm256_extracti128_si256(res_a_round_1, 1));
+ i += 2;
+
+ // Check whether any rows remain to be processed. If so, load the data
+ // required for the next iteration.
+ if (i < height - 1) {
+ l10 = l11 = l9;
+ // k0l0... k7l7
+ const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11);
+ // i0j0... i7j7 | k0l0... k7l7
+ s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20);
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ }
+ }
+ }
+
+ if (remain_col)
+ return resize_vert_dir_c(intbuf, output, out_stride, height, height2,
+ stride, stride - remain_col);
+
+ return true;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c
index 47b5f5cfb7..8b0d3bcc7e 100644
--- a/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c
+++ b/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c
@@ -1008,10 +1008,13 @@ static uint16_t highbd_find_average_neon(const uint16_t *src, int src_stride,
}
void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8,
- const uint8_t *src8, int h_start, int h_end,
+ const uint8_t *src8, int16_t *dgd_avg,
+ int16_t *src_avg, int h_start, int h_end,
int v_start, int v_end, int dgd_stride,
int src_stride, int64_t *M, int64_t *H,
aom_bit_depth_t bit_depth) {
+ (void)dgd_avg;
+ (void)src_avg;
assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_REDUCED);
const int wiener_halfwin = wiener_win >> 1;
diff --git a/third_party/aom/av1/encoder/arm/neon/pickrst_sve.c b/third_party/aom/av1/encoder/arm/neon/pickrst_sve.c
new file mode 100644
index 0000000000..a519ecc5f5
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/pickrst_sve.c
@@ -0,0 +1,590 @@
+/*
+ * Copyright (c) 2024, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/aom_neon_sve_bridge.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
+static INLINE uint8_t find_average_sve(const uint8_t *src, int src_stride,
+ int width, int height) {
+ uint32x4_t avg_u32 = vdupq_n_u32(0);
+ uint8x16_t ones = vdupq_n_u8(1);
+
+ // Use a predicate to compute the last columns.
+ svbool_t pattern = svwhilelt_b8_u32(0, width % 16);
+
+ int h = height;
+ do {
+ int j = width;
+ const uint8_t *src_ptr = src;
+ while (j >= 16) {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ avg_u32 = vdotq_u32(avg_u32, s, ones);
+
+ j -= 16;
+ src_ptr += 16;
+ }
+ uint8x16_t s_end = svget_neonq_u8(svld1_u8(pattern, src_ptr));
+ avg_u32 = vdotq_u32(avg_u32, s_end, ones);
+
+ src += src_stride;
+ } while (--h != 0);
+ return (uint8_t)(vaddlvq_u32(avg_u32) / (width * height));
+}
+
+static INLINE void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg,
+ int16_t *buf_avg, int buf_avg_stride,
+ int width, int height,
+ int downsample_factor) {
+ uint8x8_t avg_u8 = vdup_n_u8(avg);
+
+ // Use a predicate to compute the last columns.
+ svbool_t pattern = svwhilelt_b8_u32(0, width % 8);
+
+ uint8x8_t avg_end = vget_low_u8(svget_neonq_u8(svdup_n_u8_z(pattern, avg)));
+
+ do {
+ int j = width;
+ const uint8_t *buf_ptr = buf;
+ int16_t *buf_avg_ptr = buf_avg;
+ while (j >= 8) {
+ uint8x8_t d = vld1_u8(buf_ptr);
+ vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d, avg_u8)));
+
+ j -= 8;
+ buf_ptr += 8;
+ buf_avg_ptr += 8;
+ }
+ uint8x8_t d_end = vget_low_u8(svget_neonq_u8(svld1_u8(pattern, buf_ptr)));
+ vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d_end, avg_end)));
+
+ buf += buf_stride;
+ buf_avg += buf_avg_stride;
+ height -= downsample_factor;
+ } while (height > 0);
+}
+
+static INLINE void copy_upper_triangle(int64_t *H, int64_t *H_tmp,
+ const int wiener_win2, const int scale) {
+ for (int i = 0; i < wiener_win2 - 2; i = i + 2) {
+ // Transpose the first 2x2 square. It needs a special case as the element
+ // of the bottom left is on the diagonal.
+ int64x2_t row0 = vld1q_s64(H_tmp + i * wiener_win2 + i + 1);
+ int64x2_t row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + i + 1);
+
+ int64x2_t tr_row = aom_vtrn2q_s64(row0, row1);
+
+ vst1_s64(H_tmp + (i + 1) * wiener_win2 + i, vget_low_s64(row0));
+ vst1q_s64(H_tmp + (i + 2) * wiener_win2 + i, tr_row);
+
+ // Transpose and store all the remaining 2x2 squares of the line.
+ for (int j = i + 3; j < wiener_win2; j = j + 2) {
+ row0 = vld1q_s64(H_tmp + i * wiener_win2 + j);
+ row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + j);
+
+ int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1);
+ int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1);
+
+ vst1q_s64(H_tmp + j * wiener_win2 + i, tr_row0);
+ vst1q_s64(H_tmp + (j + 1) * wiener_win2 + i, tr_row1);
+ }
+ }
+ for (int i = 0; i < wiener_win2 * wiener_win2; i++) {
+ H[i] += H_tmp[i] * scale;
+ }
+}
+
+// Transpose the matrix that has just been computed and accumulate it in M.
+static INLINE void acc_transpose_M(int64_t *M, const int64_t *M_trn,
+ const int wiener_win, int scale) {
+ for (int i = 0; i < wiener_win; ++i) {
+ for (int j = 0; j < wiener_win; ++j) {
+ int tr_idx = j * wiener_win + i;
+ *M++ += (int64_t)(M_trn[tr_idx] * scale);
+ }
+ }
+}
+
+// Swap each half of the dgd vectors so that we can accumulate the result of
+// the dot-products directly in the destination matrix.
+static INLINE int16x8x2_t transpose_dgd(int16x8_t dgd0, int16x8_t dgd1) {
+ int16x8_t dgd_trn0 = vreinterpretq_s16_s64(
+ vzip1q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1)));
+ int16x8_t dgd_trn1 = vreinterpretq_s16_s64(
+ vzip2q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1)));
+
+ return (struct int16x8x2_t){ dgd_trn0, dgd_trn1 };
+}
+
+static INLINE void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd[5],
+ int64_t *M, int row) {
+ const int wiener_win = 5;
+
+ int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0);
+ int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]);
+
+ int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0);
+ cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1);
+ vst1q_s64(M + row * wiener_win + 0, cross_corr01);
+
+ int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2);
+ int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]);
+
+ int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0);
+ cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1);
+ vst1q_s64(M + row * wiener_win + 2, cross_corr23);
+
+ int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[4]);
+ M[row * wiener_win + 4] += vaddvq_s64(m4);
+}
+
+static INLINE void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd[7],
+ int64_t *M, int row) {
+ const int wiener_win = 7;
+
+ int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0);
+ int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]);
+
+ int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0);
+ cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1);
+ vst1q_s64(M + row * wiener_win + 0, cross_corr01);
+
+ int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2);
+ int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]);
+
+ int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0);
+ cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1);
+ vst1q_s64(M + row * wiener_win + 2, cross_corr23);
+
+ int64x2_t m45 = vld1q_s64(M + row * wiener_win + 4);
+ int16x8x2_t dgd45 = transpose_dgd(dgd[4], dgd[5]);
+
+ int64x2_t cross_corr45 = aom_svdot_lane_s16(m45, dgd45.val[0], src, 0);
+ cross_corr45 = aom_svdot_lane_s16(cross_corr45, dgd45.val[1], src, 1);
+ vst1q_s64(M + row * wiener_win + 4, cross_corr45);
+
+ int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[6]);
+ M[row * wiener_win + 6] += vaddvq_s64(m6);
+}
+
+static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H,
+ const int wiener_win,
+ const int wiener_win2) {
+ for (int row0 = 0; row0 < wiener_win; row0++) {
+ for (int row1 = row0; row1 < wiener_win; row1++) {
+ int auto_cov_idx =
+ (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1;
+
+ int64x2_t auto_cov = aom_sdotq_s16(vdupq_n_s64(0), dgd[row0], dgd[row1]);
+ H[auto_cov_idx] += vaddvq_s64(auto_cov);
+ }
+ }
+}
+
+static INLINE void compute_H_two_rows_win5(int16x8_t *dgd0, int16x8_t *dgd1,
+ int row0, int row1, int64_t *H) {
+ for (int col0 = 0; col0 < 5; col0++) {
+ int auto_cov_idx = (row0 * 5 + col0) * 25 + (row1 * 5);
+
+ int64x2_t h01 = vld1q_s64(H + auto_cov_idx);
+ int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]);
+
+ int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0);
+ auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1);
+ vst1q_s64(H + auto_cov_idx, auto_cov01);
+
+ int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2);
+ int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]);
+
+ int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0);
+ auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1);
+ vst1q_s64(H + auto_cov_idx + 2, auto_cov23);
+
+ int64x2_t auto_cov4 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[4]);
+ H[auto_cov_idx + 4] += vaddvq_s64(auto_cov4);
+ }
+}
+
+static INLINE void compute_H_two_rows_win7(int16x8_t *dgd0, int16x8_t *dgd1,
+ int row0, int row1, int64_t *H) {
+ for (int col0 = 0; col0 < 7; col0++) {
+ int auto_cov_idx = (row0 * 7 + col0) * 49 + (row1 * 7);
+
+ int64x2_t h01 = vld1q_s64(H + auto_cov_idx);
+ int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]);
+
+ int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0);
+ auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1);
+ vst1q_s64(H + auto_cov_idx, auto_cov01);
+
+ int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2);
+ int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]);
+
+ int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0);
+ auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1);
+ vst1q_s64(H + auto_cov_idx + 2, auto_cov23);
+
+ int64x2_t h45 = vld1q_s64(H + auto_cov_idx + 4);
+ int16x8x2_t dgd45 = transpose_dgd(dgd1[4], dgd1[5]);
+
+ int64x2_t auto_cov45 = aom_svdot_lane_s16(h45, dgd45.val[0], dgd0[col0], 0);
+ auto_cov45 = aom_svdot_lane_s16(auto_cov45, dgd45.val[1], dgd0[col0], 1);
+ vst1q_s64(H + auto_cov_idx + 4, auto_cov45);
+
+ int64x2_t auto_cov6 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[6]);
+ H[auto_cov_idx + 6] += vaddvq_s64(auto_cov6);
+ }
+}
+
+// This function computes two matrices: the cross-correlation between the src
+// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
+//
+// M is of size 7 * 7. It needs to be filled such that multiplying one element
+// from src with each element of a row of the wiener window will fill one
+// column of M. However this is not very convenient in terms of memory
+// accesses, as it means we do contiguous loads of dgd but strided stores to M.
+// As a result, we use an intermediate matrix M_trn which is instead filled
+// such that one row of the wiener window gives one row of M_trn. Once fully
+// computed, M_trn is then transposed to return M.
+//
+// H is of size 49 * 49. It is filled by multiplying every pair of elements of
+// the wiener window together. Since it is a symmetric matrix, we only compute
+// the upper triangle, and then copy it down to the lower one. Here we fill it
+// by taking each different pair of columns, and multiplying all the elements of
+// the first one with all the elements of the second one, with a special case
+// when multiplying a column by itself.
+static INLINE void compute_stats_win7_sve(int16_t *dgd_avg, int dgd_avg_stride,
+ int16_t *src_avg, int src_avg_stride,
+ int width, int height, int64_t *M,
+ int64_t *H, int downsample_factor) {
+ const int wiener_win = 7;
+ const int wiener_win2 = wiener_win * wiener_win;
+
+ // Use a predicate to compute the last columns of the block for H.
+ svbool_t pattern = svwhilelt_b16_u32(0, width % 8);
+
+ // Use intermediate matrices for H and M to perform the computation, they
+ // will be accumulated into the original H and M at the end.
+ int64_t M_trn[49];
+ memset(M_trn, 0, sizeof(M_trn));
+
+ int64_t H_tmp[49 * 49];
+ memset(H_tmp, 0, sizeof(H_tmp));
+
+ do {
+ // Cross-correlation (M).
+ for (int row = 0; row < wiener_win; row++) {
+ int j = 0;
+ while (j < width) {
+ int16x8_t dgd[7];
+ load_s16_8x7(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
+ &dgd[2], &dgd[3], &dgd[4], &dgd[5], &dgd[6]);
+ int16x8_t s = vld1q_s16(src_avg + j);
+
+ // Compute all the elements of one row of M.
+ compute_M_one_row_win7(s, dgd, M_trn, row);
+
+ j += 8;
+ }
+ }
+
+ // Auto-covariance (H).
+ int j = 0;
+ while (j <= width - 8) {
+ for (int col0 = 0; col0 < wiener_win; col0++) {
+ int16x8_t dgd0[7];
+ load_s16_8x7(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
+ &dgd0[2], &dgd0[3], &dgd0[4], &dgd0[5], &dgd0[6]);
+
+ // Perform computation of the first column with itself (28 elements).
+ // For the first column this will fill the upper triangle of the 7x7
+ // matrix at the top left of the H matrix. For the next columns this
+ // will fill the upper triangle of the other 7x7 matrices around H's
+ // diagonal.
+ compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
+
+ // All computation next to the matrix diagonal has already been done.
+ for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+ // Load second column; downsample scaling is applied later to H_tmp.
+ int16x8_t dgd1[7];
+ load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
+ &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);
+
+ // Compute all elements from the combination of both columns (49
+ // elements).
+ compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
+ }
+ }
+ j += 8;
+ }
+
+ if (j < width) {
+ // Process remaining columns using a predicate to discard excess elements.
+ for (int col0 = 0; col0 < wiener_win; col0++) {
+ // Load first column.
+ int16x8_t dgd0[7];
+ dgd0[0] = svget_neonq_s16(
+ svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
+ dgd0[1] = svget_neonq_s16(
+ svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
+ dgd0[2] = svget_neonq_s16(
+ svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
+ dgd0[3] = svget_neonq_s16(
+ svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
+ dgd0[4] = svget_neonq_s16(
+ svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));
+ dgd0[5] = svget_neonq_s16(
+ svld1_s16(pattern, dgd_avg + 5 * dgd_avg_stride + j + col0));
+ dgd0[6] = svget_neonq_s16(
+ svld1_s16(pattern, dgd_avg + 6 * dgd_avg_stride + j + col0));
+
+ // Perform computation of the first column with itself (28 elements).
+ // For the first column this will fill the upper triangle of the 7x7
+ // matrix at the top left of the H matrix. For the next columns this
+ // will fill the upper triangle of the other 7x7 matrices around H's
+ // diagonal.
+ compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
+
+ // All computation next to the matrix diagonal has already been done.
+ for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+ // Load second column; downsample scaling is applied later to H_tmp.
+ int16x8_t dgd1[7];
+ load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
+ &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);
+
+ // Compute all elements from the combination of both columns (49
+ // elements).
+ compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
+ }
+ }
+ }
+ dgd_avg += downsample_factor * dgd_avg_stride;
+ src_avg += src_avg_stride;
+ } while (--height != 0);
+
+ // Transpose M_trn.
+ acc_transpose_M(M, M_trn, 7, downsample_factor);
+
+ // Copy upper triangle of H in the lower one.
+ copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor);
+}
+
+// This function computes two matrices: the cross-correlation between the src
+// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
+//
+// M is of size 5 * 5. It needs to be filled such that multiplying one element
+// from src with each element of a row of the wiener window will fill one
+// column of M. However this is not very convenient in terms of memory
+// accesses, as it means we do contiguous loads of dgd but strided stores to M.
+// As a result, we use an intermediate matrix M_trn which is instead filled
+// such that one row of the wiener window gives one row of M_trn. Once fully
+// computed, M_trn is then transposed to return M.
+//
+// H is of size 25 * 25. It is filled by multiplying every pair of elements of
+// the wiener window together. Since it is a symmetric matrix, we only compute
+// the upper triangle, and then copy it down to the lower one. Here we fill it
+// by taking each different pair of columns, and multiplying all the elements of
+// the first one with all the elements of the second one, with a special case
+// when multiplying a column by itself.
+static INLINE void compute_stats_win5_sve(int16_t *dgd_avg, int dgd_avg_stride,
+ int16_t *src_avg, int src_avg_stride,
+ int width, int height, int64_t *M,
+ int64_t *H, int downsample_factor) {
+ const int wiener_win = 5;
+ const int wiener_win2 = wiener_win * wiener_win;
+
+ // Use a predicate to compute the last columns of the block for H.
+ svbool_t pattern = svwhilelt_b16_u32(0, width % 8);
+
+ // Use intermediate matrices for H and M to perform the computation, they
+ // will be accumulated into the original H and M at the end.
+ int64_t M_trn[25];
+ memset(M_trn, 0, sizeof(M_trn));
+
+ int64_t H_tmp[25 * 25];
+ memset(H_tmp, 0, sizeof(H_tmp));
+
+ do {
+ // Cross-correlation (M).
+ for (int row = 0; row < wiener_win; row++) {
+ int j = 0;
+ while (j < width) {
+ int16x8_t dgd[5];
+ load_s16_8x5(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
+ &dgd[2], &dgd[3], &dgd[4]);
+ int16x8_t s = vld1q_s16(src_avg + j);
+
+ // Compute all the elements of one row of M.
+ compute_M_one_row_win5(s, dgd, M_trn, row);
+
+ j += 8;
+ }
+ }
+
+ // Auto-covariance (H).
+ int j = 0;
+ while (j <= width - 8) {
+ for (int col0 = 0; col0 < wiener_win; col0++) {
+ // Load first column.
+ int16x8_t dgd0[5];
+ load_s16_8x5(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
+ &dgd0[2], &dgd0[3], &dgd0[4]);
+
+ // Perform computation of the first column with itself (15 elements).
+ // For the first column this will fill the upper triangle of the 5x5
+ // matrix at the top left of the H matrix. For the next columns this
+ // will fill the upper triangle of the other 5x5 matrices around H's
+ // diagonal.
+ compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
+
+ // All computation next to the matrix diagonal has already been done.
+ for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+ // Load second column; downsample scaling is applied later to H_tmp.
+ int16x8_t dgd1[5];
+ load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
+ &dgd1[2], &dgd1[3], &dgd1[4]);
+
+ // Compute all elements from the combination of both columns (25
+ // elements).
+ compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
+ }
+ }
+ j += 8;
+ }
+
+ // Process remaining columns using a predicate to discard excess elements.
+ if (j < width) {
+ for (int col0 = 0; col0 < wiener_win; col0++) {
+ int16x8_t dgd0[5];
+ dgd0[0] = svget_neonq_s16(
+ svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
+ dgd0[1] = svget_neonq_s16(
+ svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
+ dgd0[2] = svget_neonq_s16(
+ svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
+ dgd0[3] = svget_neonq_s16(
+ svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
+ dgd0[4] = svget_neonq_s16(
+ svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));
+
+ // Perform computation of the first column with itself (15 elements).
+ // For the first column this will fill the upper triangle of the 5x5
+ // matrix at the top left of the H matrix. For the next columns this
+ // will fill the upper triangle of the other 5x5 matrices around H's
+ // diagonal.
+ compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);
+
+ // All computation next to the matrix diagonal has already been done.
+ for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
+ // Load second column; downsample scaling is applied later to H_tmp.
+ int16x8_t dgd1[5];
+ load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
+ &dgd1[2], &dgd1[3], &dgd1[4]);
+
+ // Compute all elements from the combination of both columns (25
+ // elements).
+ compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
+ }
+ }
+ }
+ dgd_avg += downsample_factor * dgd_avg_stride;
+ src_avg += src_avg_stride;
+ } while (--height != 0);
+
+ // Transpose M_trn.
+ acc_transpose_M(M, M_trn, 5, downsample_factor);
+
+ // Copy upper triangle of H in the lower one.
+ copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor);
+}
+
+void av1_compute_stats_sve(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int16_t *dgd_avg,
+ int16_t *src_avg, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = wiener_win >> 1;
+ const int32_t width = h_end - h_start;
+ const int32_t height = v_end - v_start;
+ const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start];
+ memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+ memset(M, 0, sizeof(*M) * wiener_win * wiener_win);
+
+ const uint8_t avg = find_average_sve(dgd_start, dgd_stride, width, height);
+ const int downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+
+ // dgd_avg and src_avg have been memset to zero before calling this
+ // function, so round up the stride to the next multiple of 8 so that we
+ // don't have to worry about a tail loop when computing M.
+ const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8;
+ const int src_avg_stride = (width & ~7) + 8;
+
+ // Compute (dgd - avg) and store it in dgd_avg.
+ // The wiener window will slide along the dgd frame, centered on each pixel.
+ // For the top left pixel and all the pixels on the side of the frame this
+ // means half of the window will be outside of the frame. As such the actual
+ // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
+ // wider and 2 * wiener_halfwin higher than the original dgd buffer.
+ const int vert_offset = v_start - wiener_halfwin;
+ const int horiz_offset = h_start - wiener_halfwin;
+ const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
+ compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
+ width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1);
+
+ // Compute (src - avg), downsample if necessary and store in src_avg.
+ const uint8_t *src_start = src + h_start + v_start * src_stride;
+ compute_sub_avg(src_start, src_stride * downsample_factor, avg, src_avg,
+ src_avg_stride, width, height, downsample_factor);
+
+ const int downsample_height = height / downsample_factor;
+
+ // Since the height is not necessarily a multiple of the downsample factor,
+ // the last line of src will be scaled according to how many rows remain.
+ const int downsample_remainder = height % downsample_factor;
+
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride,
+ width, downsample_height, M, H, downsample_factor);
+ } else {
+ compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride,
+ width, downsample_height, M, H, downsample_factor);
+ }
+
+ if (downsample_remainder > 0) {
+ const int remainder_offset = height - downsample_remainder;
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_sve(
+ dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride,
+ src_avg + downsample_height * src_avg_stride, src_avg_stride, width,
+ 1, M, H, downsample_remainder);
+ } else {
+ compute_stats_win5_sve(
+ dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride,
+ src_avg + downsample_height * src_avg_stride, src_avg_stride, width,
+ 1, M, H, downsample_remainder);
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/enc_enums.h b/third_party/aom/av1/encoder/enc_enums.h
index 20cefa16a5..0a8b0f258a 100644
--- a/third_party/aom/av1/encoder/enc_enums.h
+++ b/third_party/aom/av1/encoder/enc_enums.h
@@ -12,10 +12,14 @@
#ifndef AOM_AV1_ENCODER_ENC_ENUMS_H_
#define AOM_AV1_ENCODER_ENC_ENUMS_H_
+#include "aom_ports/mem.h"
+
#ifdef __cplusplus
extern "C" {
#endif
+#define MAX_NUM_THREADS 64
+
// This enumerator type needs to be kept aligned with the mode order in
// const MODE_DEFINITION av1_mode_defs[MAX_MODES] used in the rd code.
enum {
diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c
index a9214f77c2..07382eb6cc 100644
--- a/third_party/aom/av1/encoder/encodeframe.c
+++ b/third_party/aom/av1/encoder/encodeframe.c
@@ -537,7 +537,9 @@ static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td,
// Set the partition
if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip ||
(sf->rt_sf.use_fast_fixed_part && x->sb_force_fixed_part == 1 &&
- !frame_is_intra_only(cm))) {
+ (!frame_is_intra_only(cm) &&
+ (!cpi->ppi->use_svc ||
+ !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)))) {
// set a fixed-size partition
av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
BLOCK_SIZE bsize_select = sf->part_sf.fixed_partition_size;
diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h
index 4de5d426ce..a919bd906a 100644
--- a/third_party/aom/av1/encoder/encoder.h
+++ b/third_party/aom/av1/encoder/encoder.h
@@ -37,6 +37,7 @@
#include "av1/encoder/av1_quantize.h"
#include "av1/encoder/block.h"
#include "av1/encoder/context_tree.h"
+#include "av1/encoder/enc_enums.h"
#include "av1/encoder/encodemb.h"
#include "av1/encoder/external_partition.h"
#include "av1/encoder/firstpass.h"
@@ -74,7 +75,6 @@
#endif
#include "aom/internal/aom_codec_internal.h"
-#include "aom_util/aom_thread.h"
#ifdef __cplusplus
extern "C" {
diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c
index 755535ba51..1d0092a5ed 100644
--- a/third_party/aom/av1/encoder/ethread.c
+++ b/third_party/aom/av1/encoder/ethread.c
@@ -19,6 +19,7 @@
#include "av1/encoder/allintra_vis.h"
#include "av1/encoder/bitstream.h"
+#include "av1/encoder/enc_enums.h"
#include "av1/encoder/encodeframe.h"
#include "av1/encoder/encodeframe_utils.h"
#include "av1/encoder/encoder.h"
@@ -2520,7 +2521,7 @@ void av1_tf_do_filtering_mt(AV1_COMP *cpi) {
static AOM_INLINE int get_next_gm_job(AV1_COMP *cpi, int *frame_idx,
int cur_dir) {
GlobalMotionInfo *gm_info = &cpi->gm_info;
- JobInfo *job_info = &cpi->mt_info.gm_sync.job_info;
+ GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info;
int total_refs = gm_info->num_ref_frames[cur_dir];
int8_t cur_frame_to_process = job_info->next_frame_to_process[cur_dir];
@@ -2551,7 +2552,7 @@ static int gm_mt_worker_hook(void *arg1, void *unused) {
AV1_COMP *cpi = thread_data->cpi;
GlobalMotionInfo *gm_info = &cpi->gm_info;
AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync;
- JobInfo *job_info = &gm_sync->job_info;
+ GlobalMotionJobInfo *job_info = &gm_sync->job_info;
int thread_id = thread_data->thread_id;
GlobalMotionData *gm_thread_data = &thread_data->td->gm_data;
#if CONFIG_MULTITHREAD
@@ -2689,7 +2690,7 @@ static AOM_INLINE void gm_dealloc_thread_data(AV1_COMP *cpi, int num_workers) {
// Implements multi-threading for global motion.
void av1_global_motion_estimation_mt(AV1_COMP *cpi) {
- JobInfo *job_info = &cpi->mt_info.gm_sync.job_info;
+ GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info;
av1_zero(*job_info);
diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h
index de46a0e1f2..2645f93e3c 100644
--- a/third_party/aom/av1/encoder/global_motion.h
+++ b/third_party/aom/av1/encoder/global_motion.h
@@ -14,9 +14,8 @@
#include "aom/aom_integer.h"
#include "aom_dsp/flow_estimation/flow_estimation.h"
-#include "aom_scale/yv12config.h"
#include "aom_util/aom_pthread.h"
-#include "aom_util/aom_thread.h"
+#include "av1/encoder/enc_enums.h"
#ifdef __cplusplus
extern "C" {
@@ -58,11 +57,11 @@ typedef struct {
// next_frame_to_process[i] will hold the count of next reference frame to be
// processed in the direction 'i'.
int8_t next_frame_to_process[MAX_DIRECTIONS];
-} JobInfo;
+} GlobalMotionJobInfo;
typedef struct {
// Data related to assigning jobs for global motion multi-threading.
- JobInfo job_info;
+ GlobalMotionJobInfo job_info;
#if CONFIG_MULTITHREAD
// Mutex lock used while dispatching jobs.
diff --git a/third_party/aom/av1/encoder/nonrd_pickmode.c b/third_party/aom/av1/encoder/nonrd_pickmode.c
index 57c74f66d5..08ecb8495a 100644
--- a/third_party/aom/av1/encoder/nonrd_pickmode.c
+++ b/third_party/aom/av1/encoder/nonrd_pickmode.c
@@ -1886,14 +1886,17 @@ static AOM_INLINE int skip_mode_by_low_temp(
static AOM_INLINE int skip_mode_by_bsize_and_ref_frame(
PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize,
- int extra_prune, unsigned int sse_zeromv_norm, int more_prune) {
+ int extra_prune, unsigned int sse_zeromv_norm, int more_prune,
+ int skip_nearmv) {
const unsigned int thresh_skip_golden = 500;
if (ref_frame != LAST_FRAME && sse_zeromv_norm < thresh_skip_golden &&
mode == NEWMV)
return 1;
- if (bsize == BLOCK_128X128 && mode == NEWMV) return 1;
+ if ((bsize == BLOCK_128X128 && mode == NEWMV) ||
+ (skip_nearmv && mode == NEARMV))
+ return 1;
// Skip testing non-LAST if this flag is set.
if (extra_prune) {
@@ -2361,6 +2364,18 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd(
(*this_mode != GLOBALMV || *ref_frame != LAST_FRAME))
return true;
+ // Skip the mode if use reference frame mask flag is not set.
+ if (!search_state->use_ref_frame_mask[*ref_frame]) return true;
+
+ // Skip mode for some modes and reference frames when
+ // force_zeromv_skip_for_blk flag is true.
+ if (x->force_zeromv_skip_for_blk &&
+ ((!(*this_mode == NEARESTMV &&
+ search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) &&
+ *this_mode != GLOBALMV) ||
+ *ref_frame != LAST_FRAME))
+ return true;
+
if (x->sb_me_block && *ref_frame == LAST_FRAME) {
// We want to make sure to test the superblock MV:
// so don't skip (return false) for NEAREST_LAST or NEAR_LAST if they
@@ -2400,18 +2415,6 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd(
mi->ref_frame[0] = *ref_frame;
mi->ref_frame[1] = *ref_frame2;
- // Skip the mode if use reference frame mask flag is not set.
- if (!search_state->use_ref_frame_mask[*ref_frame]) return true;
-
- // Skip mode for some modes and reference frames when
- // force_zeromv_skip_for_blk flag is true.
- if (x->force_zeromv_skip_for_blk &&
- ((!(*this_mode == NEARESTMV &&
- search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) &&
- *this_mode != GLOBALMV) ||
- *ref_frame != LAST_FRAME))
- return true;
-
// Skip compound mode based on variance of previously evaluated single
// reference modes.
if (rt_sf->prune_compoundmode_with_singlemode_var && !*is_single_pred &&
@@ -2478,7 +2481,8 @@ static AOM_FORCE_INLINE bool skip_inter_mode_nonrd(
// properties.
if (skip_mode_by_bsize_and_ref_frame(
*this_mode, *ref_frame, bsize, x->nonrd_prune_ref_frame_search,
- sse_zeromv_norm, rt_sf->nonrd_aggressive_skip))
+ sse_zeromv_norm, rt_sf->nonrd_aggressive_skip,
+ rt_sf->increase_source_sad_thresh))
return true;
// Skip mode based on low temporal variance and souce sad.
diff --git a/third_party/aom/av1/encoder/partition_search.c b/third_party/aom/av1/encoder/partition_search.c
index 61d49a23f2..30ea7d9140 100644
--- a/third_party/aom/av1/encoder/partition_search.c
+++ b/third_party/aom/av1/encoder/partition_search.c
@@ -2323,8 +2323,9 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
}
if (cpi->sf.rt_sf.skip_cdef_sb) {
// cdef_strength is initialized to 1 which means skip_cdef, and is updated
- // here. Check to see is skipping cdef is allowed.
- // Always allow cdef_skip for seg_skip = 1.
+ // here. Check to see if skipping cdef is allowed. Never skip on slide/scene
+ // change, near a key frame, or when color sensitivity is set. Always allow
+ // cdef_skip for seg_skip = 1.
const int allow_cdef_skipping =
seg_skip ||
(cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad &&
@@ -2338,8 +2339,16 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
MB_MODE_INFO **mi_sb =
cm->mi_params.mi_grid_base +
get_mi_grid_idx(&cm->mi_params, mi_row_sb, mi_col_sb);
- // Do not skip if intra or new mv is picked, or color sensitivity is set.
- // Never skip on slide/scene change.
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ unsigned int thresh_spatial_var =
+ (cpi->oxcf.speed >= 11 && !is_720p_or_larger &&
+ cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN)
+ ? 400
+ : UINT_MAX;
+ // For skip_cdef_sb = 1: do not skip if allow_cdef_skipping is false or
+ // intra or new mv is picked, with possible condition on spatial variance.
+ // For skip_cdef_sb >= 2: more aggressive mode to always skip unless
+ // allow_cdef_skipping is false and source_variance is non-zero.
if (cpi->sf.rt_sf.skip_cdef_sb >= 2) {
mi_sb[0]->cdef_strength =
mi_sb[0]->cdef_strength &&
@@ -2347,7 +2356,8 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
} else {
mi_sb[0]->cdef_strength =
mi_sb[0]->cdef_strength && allow_cdef_skipping &&
- !(mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV);
+ !(x->source_variance < thresh_spatial_var &&
+ (mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV));
}
// Store in the pickmode context.
ctx->mic.cdef_strength = mi_sb[0]->cdef_strength;
diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c
index a504535028..ce0357163d 100644
--- a/third_party/aom/av1/encoder/picklpf.c
+++ b/third_party/aom/av1/encoder/picklpf.c
@@ -257,6 +257,8 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
inter_frame_multiplier = inter_frame_multiplier << 1;
else if (cpi->rc.frame_source_sad > 50000)
inter_frame_multiplier = 3 * (inter_frame_multiplier >> 1);
+ } else if (cpi->sf.rt_sf.use_fast_fixed_part) {
+ inter_frame_multiplier = inter_frame_multiplier << 1;
}
// These values were determined by linear fitting the result of the
// searched level for 8 bit depth:
diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c
index b0d0d0bb78..a431c4dada 100644
--- a/third_party/aom/av1/encoder/pickrst.c
+++ b/third_party/aom/av1/encoder/pickrst.c
@@ -1044,10 +1044,13 @@ void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
#if CONFIG_AV1_HIGHBITDEPTH
void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8,
- const uint8_t *src8, int h_start, int h_end,
+ const uint8_t *src8, int16_t *dgd_avg,
+ int16_t *src_avg, int h_start, int h_end,
int v_start, int v_end, int dgd_stride,
int src_stride, int64_t *M, int64_t *H,
aom_bit_depth_t bit_depth) {
+ (void)dgd_avg;
+ (void)src_avg;
int i, j, k, l;
int32_t Y[WIENER_WIN2];
const int wiener_win2 = wiener_win * wiener_win;
@@ -1659,9 +1662,10 @@ static AOM_INLINE void search_wiener(
// functions. Optimize intrinsics of HBD design similar to LBD (i.e.,
// pre-calculate d and s buffers and avoid most of the C operations).
av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer,
- rsc->src_buffer, limits->h_start, limits->h_end,
- limits->v_start, limits->v_end, rsc->dgd_stride,
- rsc->src_stride, M, H, cm->seq_params->bit_depth);
+ rsc->src_buffer, rsc->dgd_avg, rsc->src_avg,
+ limits->h_start, limits->h_end, limits->v_start,
+ limits->v_end, rsc->dgd_stride, rsc->src_stride, M,
+ H, cm->seq_params->bit_depth);
} else {
av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer,
rsc->dgd_avg, rsc->src_avg, limits->h_start,
@@ -2081,10 +2085,9 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
// and height aligned to multiple of 16 is considered for intrinsic purpose.
rsc.dgd_avg = NULL;
rsc.src_avg = NULL;
-#if HAVE_AVX2 || HAVE_NEON
- // The buffers allocated below are used during Wiener filter processing of low
- // bitdepth path. Hence, allocate the same when Wiener filter is enabled in
- // low bitdepth path.
+#if HAVE_AVX2
+ // The buffers allocated below are used during Wiener filter processing.
+ // Hence, allocate the same when Wiener filter is enabled.
if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) {
const int buf_size = sizeof(*cpi->pick_lr_ctxt.dgd_avg) * 6 *
RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
@@ -2221,7 +2224,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
best_luma_unit_size);
}
-#if HAVE_AVX || HAVE_NEON
+#if HAVE_AVX2
if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) {
aom_free(cpi->pick_lr_ctxt.dgd_avg);
cpi->pick_lr_ctxt.dgd_avg = NULL;
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
index 256b6fc9eb..9a00042520 100644
--- a/third_party/aom/av1/encoder/speed_features.c
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -1461,7 +1461,7 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
// for resolutions below 720p.
if (speed >= 11 && !is_720p_or_larger &&
cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) {
- sf->rt_sf.skip_cdef_sb = 2;
+ sf->rt_sf.skip_cdef_sb = 1;
sf->rt_sf.force_only_last_ref = 1;
sf->rt_sf.selective_cdf_update = 1;
sf->rt_sf.use_nonrd_filter_search = 0;
diff --git a/third_party/aom/av1/encoder/tune_vmaf.c b/third_party/aom/av1/encoder/tune_vmaf.c
index 91db3db726..fdb7c77ebc 100644
--- a/third_party/aom/av1/encoder/tune_vmaf.c
+++ b/third_party/aom/av1/encoder/tune_vmaf.c
@@ -247,7 +247,9 @@ static AOM_INLINE void unsharp(const AV1_COMP *const cpi,
// 8-tap Gaussian convolution filter with sigma = 1.0, sums to 128,
// all co-efficients must be even.
-DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 0, 8, 30, 52,
+// The array is of size 9 to allow passing gauss_filter + 1 to
+// _mm_loadu_si128() in prepare_coeffs_6t().
+DECLARE_ALIGNED(16, static const int16_t, gauss_filter[9]) = { 0, 8, 30, 52,
30, 8, 0, 0 };
static AOM_INLINE void gaussian_blur(const int bit_depth,
const YV12_BUFFER_CONFIG *source,
diff --git a/third_party/aom/av1/encoder/x86/pickrst_avx2.c b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
index 6658ed39a8..1f76576c9e 100644
--- a/third_party/aom/av1/encoder/x86/pickrst_avx2.c
+++ b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
@@ -345,21 +345,27 @@ static INLINE void compute_stats_highbd_win5_opt_avx2(
}
void av1_compute_stats_highbd_avx2(int wiener_win, const uint8_t *dgd8,
- const uint8_t *src8, int h_start, int h_end,
+ const uint8_t *src8, int16_t *dgd_avg,
+ int16_t *src_avg, int h_start, int h_end,
int v_start, int v_end, int dgd_stride,
int src_stride, int64_t *M, int64_t *H,
aom_bit_depth_t bit_depth) {
if (wiener_win == WIENER_WIN) {
+ (void)dgd_avg;
+ (void)src_avg;
compute_stats_highbd_win7_opt_avx2(dgd8, src8, h_start, h_end, v_start,
v_end, dgd_stride, src_stride, M, H,
bit_depth);
} else if (wiener_win == WIENER_WIN_CHROMA) {
+ (void)dgd_avg;
+ (void)src_avg;
compute_stats_highbd_win5_opt_avx2(dgd8, src8, h_start, h_end, v_start,
v_end, dgd_stride, src_stride, M, H,
bit_depth);
} else {
- av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start,
- v_end, dgd_stride, src_stride, M, H, bit_depth);
+ av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg,
+ h_start, h_end, v_start, v_end, dgd_stride,
+ src_stride, M, H, bit_depth);
}
}
#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/x86/pickrst_sse4.c b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
index 50db305802..3617d33fef 100644
--- a/third_party/aom/av1/encoder/x86/pickrst_sse4.c
+++ b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
@@ -524,21 +524,27 @@ static INLINE void compute_stats_highbd_win5_opt_sse4_1(
}
void av1_compute_stats_highbd_sse4_1(int wiener_win, const uint8_t *dgd8,
- const uint8_t *src8, int h_start,
- int h_end, int v_start, int v_end,
- int dgd_stride, int src_stride, int64_t *M,
- int64_t *H, aom_bit_depth_t bit_depth) {
+ const uint8_t *src8, int16_t *dgd_avg,
+ int16_t *src_avg, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ aom_bit_depth_t bit_depth) {
if (wiener_win == WIENER_WIN) {
+ (void)dgd_avg;
+ (void)src_avg;
compute_stats_highbd_win7_opt_sse4_1(dgd8, src8, h_start, h_end, v_start,
v_end, dgd_stride, src_stride, M, H,
bit_depth);
} else if (wiener_win == WIENER_WIN_CHROMA) {
+ (void)dgd_avg;
+ (void)src_avg;
compute_stats_highbd_win5_opt_sse4_1(dgd8, src8, h_start, h_end, v_start,
v_end, dgd_stride, src_stride, M, H,
bit_depth);
} else {
- av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start,
- v_end, dgd_stride, src_stride, M, H, bit_depth);
+ av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg,
+ h_start, h_end, v_start, v_end, dgd_stride,
+ src_stride, M, H, bit_depth);
}
}
#endif // CONFIG_AV1_HIGHBITDEPTH