diff options
Diffstat (limited to 'third_party/aom')
26 files changed, 694 insertions, 157 deletions
diff --git a/third_party/aom/AUTHORS b/third_party/aom/AUTHORS index ade7a1a5d0..509c0d1c9d 100644 --- a/third_party/aom/AUTHORS +++ b/third_party/aom/AUTHORS @@ -235,6 +235,7 @@ Ronald S. Bultje <rsbultje@gmail.com> Rostislav Pehlivanov <rpehlivanov@mozilla.com> Ruiling Song <ruiling.song@intel.com> Rui Ueyama <ruiu@google.com> +Ruoyu Zhong <zhongruoyu@outlook.com> Rupert Swarbrick <rupert.swarbrick@argondesign.com> Ryan Lei <ryanlei@fb.com> Ryan Overbeck <rover@google.com> diff --git a/third_party/aom/CHANGELOG b/third_party/aom/CHANGELOG index b243837d3c..b5c1afbba2 100644 --- a/third_party/aom/CHANGELOG +++ b/third_party/aom/CHANGELOG @@ -1,3 +1,39 @@ +2024-01-17 v3.8.1 + This release includes several bug fixes. This release is ABI + compatible with the last release. See + https://aomedia.googlesource.com/aom/+log/v3.8.0..v3.8.1 for all the + commits in this release. + + - Bug Fixes + * aomedia:3520: get_cubic_kernel_dbl: Assertion `0 <= x && x < 1' + failed. + * aomedia:3526: alloc_compressor_data() is called during every + aom_codec_control() call on the encoder. + * aomedia:3527: aom/av1/encoder/mcomp.c:1810: av1_full_pixel_search: + Assertion `ms_params->ms_buffers.ref->width == + ms_params->ms_buffers.src->width' failed. + * aomedia:3534: libaom encoder crashed by AOM_USAGE_ALL_INTRA and + AOM_EFLAG_NO_REF_LAST flags. + * b/310455204: Recreate workers if necessary. + * b/310548198: Update frame size in actual encoding. + * b/314858909: Do not use adaptive error estimate. + * Fix a hang of cmake on arm64 macOS with cmake 3.27.0 or later. + +2024-01-18 v3.7.2 + This release includes three bug fixes. This release is ABI compatible + with the last release. See + https://aomedia.googlesource.com/aom/+log/v3.7.1..v3.7.2 for all the + commits in this release. + + - Bug Fixes + * aomedia:3520: get_cubic_kernel_dbl: Assertion `0 <= x && x < 1' + failed. + * aomedia:3526: alloc_compressor_data() is called during every + aom_codec_control() call on the encoder. Note that this partially + reverts the fix for bug aomedia:3349. + * b/310457427 and b/310766628: Only use rec_sse in CBR mode. + * Fix a hang of cmake on arm64 macOS with cmake 3.27.0 or later. + 2023-11-30 v3.8.0 This release includes new codec interfaces, compression efficiency and perceptual improvements, speedup and memory optimizations and many bug diff --git a/third_party/aom/CMakeLists.txt b/third_party/aom/CMakeLists.txt index 76944e6917..a02b220bdb 100644 --- a/third_party/aom/CMakeLists.txt +++ b/third_party/aom/CMakeLists.txt @@ -59,7 +59,7 @@ endif() # # We set SO_FILE_VERSION = [c-a].a.r set(LT_CURRENT 11) -set(LT_REVISION 0) +set(LT_REVISION 1) set(LT_AGE 8) math(EXPR SO_VERSION "${LT_CURRENT} - ${LT_AGE}") set(SO_FILE_VERSION "${SO_VERSION}.${LT_AGE}.${LT_REVISION}") diff --git a/third_party/aom/aom/src/aom_codec.c b/third_party/aom/aom/src/aom_codec.c index 512fd28196..316cc6fd23 100644 --- a/third_party/aom/aom/src/aom_codec.c +++ b/third_party/aom/aom/src/aom_codec.c @@ -170,6 +170,7 @@ void aom_internal_error(struct aom_internal_error_info *info, void aom_internal_error_copy(struct aom_internal_error_info *info, const struct aom_internal_error_info *src) { assert(info != src); + assert(!src->setjmp); if (!src->has_detail) { aom_internal_error(info, src->error_code, NULL); diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl index 4b49605e53..7bb156ac59 100755 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl @@ -1352,16 +1352,19 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon sve/; - specialize "aom_highbd_${bd}_mse16x8", qw/neon sve/; - specialize "aom_highbd_${bd}_mse8x16", qw/neon sve/; - specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon sve/; - } + if ($bd eq 8) { + specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon neon_dotprod/; + specialize "aom_highbd_${bd}_mse16x8", qw/neon neon_dotprod/; + specialize "aom_highbd_${bd}_mse8x16", qw/neon neon_dotprod/; + specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon neon_dotprod/; + } else { + specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon sve/; + specialize "aom_highbd_${bd}_mse16x8", qw/neon sve/; + specialize "aom_highbd_${bd}_mse8x16", qw/neon sve/; + specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon sve/; + } - specialize "aom_highbd_8_mse16x16", qw/neon_dotprod/; - specialize "aom_highbd_8_mse16x8", qw/neon_dotprod/; - specialize "aom_highbd_8_mse8x16", qw/neon_dotprod/; - specialize "aom_highbd_8_mse8x8", qw/neon_dotprod/; + } } # diff --git a/third_party/aom/aom_dsp/arm/highbd_variance_sve.c b/third_party/aom/aom_dsp/arm/highbd_variance_sve.c index d0058bfa90..a2c30a1688 100644 --- a/third_party/aom/aom_dsp/arm/highbd_variance_sve.c +++ b/third_party/aom/aom_dsp/arm/highbd_variance_sve.c @@ -348,15 +348,6 @@ static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr, } #define HIGHBD_MSE_WXH_SVE(w, h) \ - uint32_t aom_highbd_8_mse##w##x##h##_sve( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse); \ - return *sse; \ - } \ - \ uint32_t aom_highbd_10_mse##w##x##h##_sve( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c index d8dc60c1fe..c3716b3a78 100644 --- a/third_party/aom/aom_dsp/arm/intrapred_neon.c +++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c @@ -11,6 +11,7 @@ #include <arm_neon.h> #include <assert.h> +#include <stdint.h> #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" diff --git a/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c index ee42be7393..62729133e3 100644 --- a/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c +++ b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c @@ -22,7 +22,7 @@ static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { // Check that the fractional position is in range. // - // Note: x is calculated from (eg.) `u_frac = u - floor(u)`. + // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. // Mathematically, this implies that 0 <= x < 1. However, in practice it is // possible to have x == 1 due to floating point rounding. This is fine, // and we still interpolate correctly if we allow x = 1. diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_match.c b/third_party/aom/aom_dsp/flow_estimation/corner_match.c index cef719b68d..dc7589a8c6 100644 --- a/third_party/aom/aom_dsp/flow_estimation/corner_match.c +++ b/third_party/aom/aom_dsp/flow_estimation/corner_match.c @@ -224,7 +224,7 @@ bool av1_compute_global_motion_feature_match( *mem_alloc_failed = true; return false; } - if (!av1_compute_corner_list(src_pyramid, src_corners)) { + if (!av1_compute_corner_list(ref_pyramid, ref_corners)) { *mem_alloc_failed = true; return false; } diff --git a/third_party/aom/aom_dsp/flow_estimation/disflow.c b/third_party/aom/aom_dsp/flow_estimation/disflow.c index 147a8ab3b3..82b531c729 100644 --- a/third_party/aom/aom_dsp/flow_estimation/disflow.c +++ b/third_party/aom/aom_dsp/flow_estimation/disflow.c @@ -25,7 +25,7 @@ #include "config/aom_dsp_rtcd.h" // Amount to downsample the flow field by. -// eg. DOWNSAMPLE_SHIFT = 2 (DOWNSAMPLE_FACTOR == 4) means we calculate +// e.g., DOWNSAMPLE_SHIFT = 2 (DOWNSAMPLE_FACTOR == 4) means we calculate // one flow point for each 4x4 pixel region of the frame // Must be a power of 2 #define DOWNSAMPLE_SHIFT 3 @@ -66,7 +66,7 @@ static double flow_upscale_filter[2][FLOW_UPSCALE_TAPS] = { static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { // Check that the fractional position is in range. // - // Note: x is calculated from (eg.) `u_frac = u - floor(u)`. + // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. // Mathematically, this implies that 0 <= x < 1. However, in practice it is // possible to have x == 1 due to floating point rounding. This is fine, // and we still interpolate correctly if we allow x = 1. diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c index d2b04c1973..2c5effd638 100644 --- a/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c +++ b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c @@ -30,7 +30,7 @@ static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { // Check that the fractional position is in range. // - // Note: x is calculated from (eg.) `u_frac = u - floor(u)`. + // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. // Mathematically, this implies that 0 <= x < 1. However, in practice it is // possible to have x == 1 due to floating point rounding. This is fine, // and we still interpolate correctly if we allow x = 1. diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake index 15577d0c0e..c66a748d40 100644 --- a/third_party/aom/av1/av1.cmake +++ b/third_party/aom/av1/av1.cmake @@ -406,6 +406,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_NEON_I8MM "${AOM_ROOT}/av1/common/arm/warp_plane_neon_i8mm.c") list(APPEND AOM_AV1_COMMON_INTRIN_SVE + "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_sve.c" "${AOM_ROOT}/av1/common/arm/warp_plane_sve.c") list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2 diff --git a/third_party/aom/av1/common/arm/highbd_warp_plane_sve.c b/third_party/aom/av1/common/arm/highbd_warp_plane_sve.c new file mode 100644 index 0000000000..7a14f21846 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_warp_plane_sve.c @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> +#include <stdbool.h> +#include <arm_neon_sve_bridge.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/dot_sve.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/scale.h" +#include "av1/common/warped_motion.h" +#include "config/av1_rtcd.h" +#include "highbd_warp_plane_neon.h" + +static INLINE int16x8_t highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd, + int sx, int alpha) { + int16x8_t f[4]; + load_filters_4(f, sx, alpha); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f[0]); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f[1]); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f[2]); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f[3]); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + + const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz)); + res = vrshlq_s32(res, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); +} + +static INLINE int16x8_t highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd, + int sx, int alpha) { + int16x8_t f[8]; + load_filters_8(f, sx, alpha); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 4); + int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 5); + int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 6); + int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 7); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f[0]); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f[1]); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f[2]); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f[3]); + int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), rv4, f[4]); + int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), rv5, f[5]); + int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), rv6, f[6]); + int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), rv7, f[7]); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + int64x2_t m45 = vpaddq_s64(m4, m5); + int64x2_t m67 = vpaddq_s64(m6, m7); + + const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res0 = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + int32x4_t res1 = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67)); + res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); + res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz)); + res0 = vrshlq_s32(res0, vdupq_n_s32(-round0)); + res1 = vrshlq_s32(res1, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); +} + +static INLINE int16x8_t highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd, + int sx) { + int16x8_t f = load_filters_1(sx); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + + const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz)); + res = vrshlq_s32(res, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); +} + +static INLINE int16x8_t highbd_horizontal_filter_8x1_f1(uint16x8x2_t in, int bd, + int sx) { + int16x8_t f = load_filters_1(sx); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 4); + int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 5); + int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 6); + int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 7); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f); + int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), rv4, f); + int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), rv5, f); + int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), rv6, f); + int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), rv7, f); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + int64x2_t m45 = vpaddq_s64(m4, m5); + int64x2_t m67 = vpaddq_s64(m6, m7); + + const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res0 = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + int32x4_t res1 = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67)); + res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); + res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz)); + res0 = vrshlq_s32(res0, vdupq_n_s32(-round0)); + res1 = vrshlq_s32(res1, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); +} + +static INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, int sy) { + const int16x8_t f = load_filters_1(sy); + const int16x4_t f0123 = vget_low_s16(f); + const int16x4_t f4567 = vget_high_s16(f); + + // No benefit to using SDOT here, the cost of rearrangement is too high. + int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3); + return m0123; +} + +static INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, int sy) { + const int16x8_t f = load_filters_1(sy); + const int16x4_t f0123 = vget_low_s16(f); + const int16x4_t f4567 = vget_high_s16(f); + + // No benefit to using SDOT here, the cost of rearrangement is too high. + int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3); + + int32x4_t m4567 = vmull_lane_s16(vget_high_s16(tmp[0]), f0123, 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[1]), f0123, 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[2]), f0123, 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[3]), f0123, 3); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[4]), f4567, 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[5]), f4567, 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[6]), f4567, 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[7]), f4567, 3); + return (int32x4x2_t){ { m0123, m4567 } }; +} + +static INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, int sy, + int gamma) { + int16x8_t s0, s1, s2, s3; + transpose_elems_s16_4x8( + vget_low_s16(tmp[0]), vget_low_s16(tmp[1]), vget_low_s16(tmp[2]), + vget_low_s16(tmp[3]), vget_low_s16(tmp[4]), vget_low_s16(tmp[5]), + vget_low_s16(tmp[6]), vget_low_s16(tmp[7]), &s0, &s1, &s2, &s3); + + int16x8_t f[4]; + load_filters_4(f, sy, gamma); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + return vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); +} + +static INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, int sy, + int gamma) { + int16x8_t s0 = tmp[0]; + int16x8_t s1 = tmp[1]; + int16x8_t s2 = tmp[2]; + int16x8_t s3 = tmp[3]; + int16x8_t s4 = tmp[4]; + int16x8_t s5 = tmp[5]; + int16x8_t s6 = tmp[6]; + int16x8_t s7 = tmp[7]; + transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t f[8]; + load_filters_8(f, sy, gamma); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]); + int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), s4, f[4]); + int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), s5, f[5]); + int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), s6, f[6]); + int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), s7, f[7]); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + int64x2_t m45 = vpaddq_s64(m4, m5); + int64x2_t m67 = vpaddq_s64(m6, m7); + + int32x4x2_t ret; + ret.val[0] = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + ret.val[1] = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67)); + return ret; +} + +void av1_highbd_warp_affine_sve(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + highbd_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, bd, conv_params, alpha, beta, gamma, + delta); +} diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl index c5fe389ba1..ef999fbba2 100644 --- a/third_party/aom/av1/common/av1_rtcd_defs.pl +++ b/third_party/aom/av1/common/av1_rtcd_defs.pl @@ -541,7 +541,7 @@ if ($opts{config} !~ /libs-x86-win32-vs.*/) { # WARPED_MOTION / GLOBAL_MOTION functions if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; - specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon/; + specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/; } add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; diff --git a/third_party/aom/av1/common/reconintra.c b/third_party/aom/av1/common/reconintra.c index 20a1e12476..f68af18cb1 100644 --- a/third_party/aom/av1/common/reconintra.c +++ b/third_party/aom/av1/common/reconintra.c @@ -1368,7 +1368,7 @@ void av1_highbd_upsample_intra_edge_c(uint16_t *p, int sz, int bd) { } } -static void highbd_build_intra_predictors( +static void highbd_build_directional_and_filter_intra_predictors( const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride, PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, @@ -1376,7 +1376,7 @@ static void highbd_build_intra_predictors( int bit_depth) { int i; uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8); DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); uint16_t *const above_row = above_data + 16; @@ -1390,7 +1390,8 @@ static void highbd_build_intra_predictors( const uint16_t *left_ref = ref - 1; const int is_dr_mode = av1_is_directional_mode(mode); const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; - int base = 128 << (bit_depth - 8); + assert(use_filter_intra || is_dr_mode); + const int base = 128 << (bit_depth - 8); // The left_data, above_data buffers must be zeroed to fix some intermittent // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4 // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are @@ -1492,49 +1493,124 @@ static void highbd_build_intra_predictors( return; } - if (is_dr_mode) { - int upsample_above = 0; - int upsample_left = 0; - if (!disable_edge_filter) { - const int need_right = p_angle < 90; - const int need_bottom = p_angle > 180; - if (p_angle != 90 && p_angle != 180) { - const int ab_le = need_above_left ? 1 : 0; - if (need_above && need_left && (txwpx + txhpx >= 24)) { - highbd_filter_intra_edge_corner(above_row, left_col); - } - if (need_above && n_top_px > 0) { - const int strength = intra_edge_filter_strength( - txwpx, txhpx, p_angle - 90, intra_edge_filter_type); - const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); - av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength); - } - if (need_left && n_left_px > 0) { - const int strength = intra_edge_filter_strength( - txhpx, txwpx, p_angle - 180, intra_edge_filter_type); - const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); - av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength); - } + assert(is_dr_mode); + int upsample_above = 0; + int upsample_left = 0; + if (!disable_edge_filter) { + const int need_right = p_angle < 90; + const int need_bottom = p_angle > 180; + if (p_angle != 90 && p_angle != 180) { + const int ab_le = need_above_left ? 1 : 0; + if (need_above && need_left && (txwpx + txhpx >= 24)) { + highbd_filter_intra_edge_corner(above_row, left_col); } - upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, - intra_edge_filter_type); - if (need_above && upsample_above) { - const int n_px = txwpx + (need_right ? txhpx : 0); - av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth); + if (need_above && n_top_px > 0) { + const int strength = intra_edge_filter_strength( + txwpx, txhpx, p_angle - 90, intra_edge_filter_type); + const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); + av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength); } - upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, - intra_edge_filter_type); - if (need_left && upsample_left) { - const int n_px = txhpx + (need_bottom ? txwpx : 0); - av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth); + if (need_left && n_left_px > 0) { + const int strength = intra_edge_filter_strength( + txhpx, txwpx, p_angle - 180, intra_edge_filter_type); + const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); + av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength); } } - highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col, - upsample_above, upsample_left, p_angle, bit_depth); + upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, + intra_edge_filter_type); + if (need_above && upsample_above) { + const int n_px = txwpx + (need_right ? txhpx : 0); + av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth); + } + upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, + intra_edge_filter_type); + if (need_left && upsample_left) { + const int n_px = txhpx + (need_bottom ? txwpx : 0); + av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth); + } + } + highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col, + upsample_above, upsample_left, p_angle, bit_depth); +} + +// For HBD encode/decode, this function generates the pred data of a given +// block for non-directional intra prediction modes (i.e., DC, SMOOTH, SMOOTH_H, +// SMOOTH_V and PAETH). +static void highbd_build_non_directional_intra_predictors( + const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride, + PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px, int n_left_px, + int bit_depth) { + int i = 0; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8); + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + int need_left = extend_modes[mode] & NEED_LEFT; + int need_above = extend_modes[mode] & NEED_ABOVE; + int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; + const uint16_t *above_ref = ref - ref_stride; + const uint16_t *left_ref = ref - 1; + const int base = 128 << (bit_depth - 8); + + assert(n_top_px >= 0); + assert(n_left_px >= 0); + assert(mode == DC_PRED || mode == SMOOTH_PRED || mode == SMOOTH_V_PRED || + mode == SMOOTH_H_PRED || mode == PAETH_PRED); + + if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { + int val = 0; + if (need_left) { + val = (n_top_px > 0) ? above_ref[0] : base + 1; + } else { + val = (n_left_px > 0) ? left_ref[0] : base - 1; + } + for (i = 0; i < txhpx; ++i) { + aom_memset16(dst, val, txwpx); + dst += dst_stride; + } return; } - // predict + DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + uint16_t *const above_row = above_data + 16; + uint16_t *const left_col = left_data + 16; + + if (need_left) { + aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS); + if (n_left_px > 0) { + for (i = 0; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; + if (i < txhpx) aom_memset16(&left_col[i], left_col[i - 1], txhpx - i); + } else if (n_top_px > 0) { + aom_memset16(left_col, above_ref[0], txhpx); + } + } + + if (need_above) { + aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS); + if (n_top_px > 0) { + memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0])); + i = n_top_px; + if (i < txwpx) aom_memset16(&above_row[i], above_row[i - 1], (txwpx - i)); + } else if (n_left_px > 0) { + aom_memset16(above_row, left_ref[0], txwpx); + } + } + + if (need_above_left) { + if (n_top_px > 0 && n_left_px > 0) { + above_row[-1] = above_ref[-1]; + } else if (n_top_px > 0) { + above_row[-1] = above_ref[0]; + } else if (n_left_px > 0) { + above_row[-1] = left_ref[0]; + } else { + above_row[-1] = base; + } + left_col[-1] = above_row[-1]; + } + if (mode == DC_PRED) { dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size]( dst, dst_stride, above_row, left_col, bit_depth); @@ -1660,12 +1736,19 @@ void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size, // separate function build_non_directional_intra_predictors() is introduced // for these modes to avoid redundant computations while generating pred data. - // TODO(aomedia:3532): Enable this refactoring for high bd path as well. - if (!is_hbd && !use_filter_intra && !is_dr_mode) { - build_non_directional_intra_predictors( - ref, ref_stride, dst, dst_stride, mode, tx_size, - have_top ? AOMMIN(txwpx, xr + txwpx) : 0, - have_left ? AOMMIN(txhpx, yd + txhpx) : 0); + const int n_top_px = have_top ? AOMMIN(txwpx, xr + txwpx) : 0; + const int n_left_px = have_left ? AOMMIN(txhpx, yd + txhpx) : 0; + if (!use_filter_intra && !is_dr_mode) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + highbd_build_non_directional_intra_predictors( + ref, ref_stride, dst, dst_stride, mode, tx_size, n_top_px, n_left_px, + xd->bd); + return; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + build_non_directional_intra_predictors(ref, ref_stride, dst, dst_stride, + mode, tx_size, n_top_px, n_left_px); return; } @@ -1717,25 +1800,23 @@ void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size, const int disable_edge_filter = !enable_intra_edge_filter; const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane); + const int n_topright_px = + have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right; + const int n_bottomleft_px = + have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left; #if CONFIG_AV1_HIGHBITDEPTH if (is_hbd) { - highbd_build_intra_predictors( + highbd_build_directional_and_filter_intra_predictors( ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode, - tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0, - have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right, - have_left ? AOMMIN(txhpx, yd + txhpx) : 0, - have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left, - intra_edge_filter_type, xd->bd); + tx_size, disable_edge_filter, n_top_px, n_topright_px, n_left_px, + n_bottomleft_px, intra_edge_filter_type, xd->bd); return; } #endif build_directional_and_filter_intra_predictors( ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode, - tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0, - have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right, - have_left ? AOMMIN(txhpx, yd + txhpx) : 0, - have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left, - intra_edge_filter_type); + tx_size, disable_edge_filter, n_top_px, n_topright_px, n_left_px, + n_bottomleft_px, intra_edge_filter_type); } void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c index 4732ad435b..fe053af5cc 100644 --- a/third_party/aom/av1/encoder/encoder.c +++ b/third_party/aom/av1/encoder/encoder.c @@ -2594,15 +2594,19 @@ static int encode_without_recode(AV1_COMP *cpi) { if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) { const YV12_BUFFER_CONFIG *const ref = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); - if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) + if (ref == NULL || ref->y_crop_width != cm->width || + ref->y_crop_height != cm->height) { cpi->ref_frame_flags ^= AOM_GOLD_FLAG; + } } } if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]) { const YV12_BUFFER_CONFIG *const ref = get_ref_frame_yv12_buf(cm, ALTREF_FRAME); - if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) + if (ref == NULL || ref->y_crop_width != cm->width || + ref->y_crop_height != cm->height) { cpi->ref_frame_flags ^= AOM_ALT_FLAG; + } } } @@ -2700,10 +2704,13 @@ static int encode_without_recode(AV1_COMP *cpi) { update_motion_stat(cpi); // Adjust the refresh of the golden (longer-term) reference based on QP - // selected for this frame. This is for CBR with 1 layer/non-svc RTC mode. + // selected for this frame. This is for CBR real-time mode, and only + // for single layer without usage of the set_ref_frame_config (so + // reference structure for 1 layer is set internally). if (!frame_is_intra_only(cm) && cpi->oxcf.rc_cfg.mode == AOM_CBR && cpi->oxcf.mode == REALTIME && svc->number_spatial_layers == 1 && svc->number_temporal_layers == 1 && !cpi->rc.rtc_external_ratectrl && + !cpi->ppi->rtc_ref.set_ref_frame_config && sf->rt_sf.gf_refresh_based_on_qp) av1_adjust_gf_refresh_qp_one_pass_rt(cpi); diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h index 5f6f67eda8..e87ab9be1f 100644 --- a/third_party/aom/av1/encoder/encoder.h +++ b/third_party/aom/av1/encoder/encoder.h @@ -3156,14 +3156,14 @@ typedef struct AV1_COMP { FRAME_INDEX_SET frame_index_set; /*! - * Store the cm->width in the last call of alloc_compressor_data(). Help + * Stores the cm->width in the last call of alloc_compressor_data(). Helps * determine whether compressor data should be reallocated when cm->width * changes. */ int data_alloc_width; /*! - * Store the cm->height in the last call of alloc_compressor_data(). Help + * Stores the cm->height in the last call of alloc_compressor_data(). Helps * determine whether compressor data should be reallocated when cm->height * changes. */ diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c index 4e53447379..f3a9828cb3 100644 --- a/third_party/aom/av1/encoder/mcomp.c +++ b/third_party/aom/av1/encoder/mcomp.c @@ -1807,7 +1807,6 @@ int av1_full_pixel_search(const FULLPEL_MV start_mv, } assert(ms_params->ms_buffers.ref->stride == ms_params->search_sites->stride); - assert(ms_params->ms_buffers.ref->width == ms_params->ms_buffers.src->width); switch (search_method) { case FAST_BIGDIA: diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c index a6c0971096..63d69cadc5 100644 --- a/third_party/aom/av1/encoder/speed_features.c +++ b/third_party/aom/av1/encoder/speed_features.c @@ -1624,6 +1624,14 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, sf->rt_sf.use_rtc_tf = 0; sf->rt_sf.nonrd_prune_ref_frame_search = 1; } + // rtc_tf feature allocates new source because of possible + // temporal filtering which may change the input source during encoding: + // this causes an issue on resized frames when psnr is calculated, + // so disable it here for frames that are resized (encoding width/height + // different from configured width/height). + if (is_psnr_calc_enabled(cpi) && (cpi->oxcf.frm_dim_cfg.width != cm->width || + cpi->oxcf.frm_dim_cfg.height != cm->height)) + sf->rt_sf.use_rtc_tf = 0; } // TODO(kyslov): now this is very similar to diff --git a/third_party/aom/common/tools_common.c b/third_party/aom/common/tools_common.c index 4d77a1b427..db02ca6299 100644 --- a/third_party/aom/common/tools_common.c +++ b/third_party/aom/common/tools_common.c @@ -97,7 +97,7 @@ int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame) { int w = aom_img_plane_width(yuv_frame, plane); const int h = aom_img_plane_height(yuv_frame, plane); int r; - // Assuming that for nv12 we read all chroma data at one time + // Assuming that for nv12 we read all chroma data at once if (yuv_frame->fmt == AOM_IMG_FMT_NV12 && plane > 1) break; if (yuv_frame->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2; /* Determine the correct plane based on the image format. The for-loop @@ -245,17 +245,21 @@ uint32_t get_fourcc_by_aom_decoder(aom_codec_iface_t *iface) { void aom_img_write(const aom_image_t *img, FILE *file) { int plane; + const int bytespp = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; for (plane = 0; plane < 3; ++plane) { const unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; - const int w = aom_img_plane_width(img, plane) * - ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + int w = aom_img_plane_width(img, plane); const int h = aom_img_plane_height(img, plane); int y; + // Assuming that for nv12 we write all chroma data at once + if (img->fmt == AOM_IMG_FMT_NV12 && plane > 1) break; + if (img->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2; + for (y = 0; y < h; ++y) { - fwrite(buf, 1, w, file); + fwrite(buf, bytespp, w, file); buf += stride; } } @@ -268,12 +272,16 @@ bool aom_img_read(aom_image_t *img, FILE *file) { for (plane = 0; plane < 3; ++plane) { unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; - const int w = aom_img_plane_width(img, plane) * bytespp; + int w = aom_img_plane_width(img, plane); const int h = aom_img_plane_height(img, plane); int y; + // Assuming that for nv12 we read all chroma data at once + if (img->fmt == AOM_IMG_FMT_NV12 && plane > 1) break; + if (img->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2; + for (y = 0; y < h; ++y) { - if (fread(buf, 1, w, file) != (size_t)w) return false; + if (fread(buf, bytespp, w, file) != (size_t)w) return false; buf += stride; } } diff --git a/third_party/aom/test/av1_c_vs_simd_encode.sh b/third_party/aom/test/av1_c_vs_simd_encode.sh index 296204d118..897ac081c1 100755 --- a/third_party/aom/test/av1_c_vs_simd_encode.sh +++ b/third_party/aom/test/av1_c_vs_simd_encode.sh @@ -104,16 +104,16 @@ av1_c_vs_simd_enc_verify_environment () { # } # Echo AOM_SIMD_CAPS_MASK for different instruction set architecture. -avx512f() { +avx2() { echo "0x1FF" } -avx2() { - echo "0x0FF" +avx() { + echo "0x17F" } -avx() { - echo "0x07F" +sse4_2() { + echo "0x13F" } sse4_1() { @@ -443,21 +443,21 @@ av1_test_generic() { done } -# This function encodes AV1 bitstream by enabling SSE2, SSE3, SSSE3, SSE4_1, AVX, AVX2 as there are -# no functions with MMX, SSE and AVX512 specialization. +# This function encodes AV1 bitstream by enabling SSE2, SSE3, SSSE3, SSE4_1, SSE4_2, AVX, AVX2 as +# there are no functions with MMX, SSE and AVX512 specialization. # The value of environment variable 'AOM_SIMD_CAPS_MASK' controls enabling of different instruction # set extension optimizations. The value of the flag 'AOM_SIMD_CAPS_MASK' and the corresponding # instruction set extension optimization enabled are as follows: -# AVX512 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX -# 1 1 1 1 1 1 1 1 1 -> 0x1FF -> Enable AVX512 and lower variants -# 0 1 1 1 1 1 1 1 1 -> 0x0FF -> Enable AVX2 and lower variants -# 0 0 1 1 1 1 1 1 1 -> 0x07F -> Enable AVX and lower variants -# 0 0 0 1 1 1 1 1 1 -> 0x03F -> Enable SSE4_1 and lower variants -# 0 0 0 0 1 1 1 1 1 -> 0x01F -> Enable SSSE3 and lower variants -# 0 0 0 0 0 1 1 1 1 -> 0x00F -> Enable SSE3 and lower variants -# 0 0 0 0 0 0 1 1 1 -> 0x007 -> Enable SSE2 and lower variants -# 0 0 0 0 0 0 0 1 1 -> 0x003 -> Enable SSE and lower variants -# 0 0 0 0 0 0 0 0 1 -> 0x001 -> Enable MMX +# SSE4_2 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX +# 1 1 1 1 1 1 1 1 1 -> 0x1FF -> Enable AVX2 and lower variants +# 1 0 1 1 1 1 1 1 1 -> 0x17F -> Enable AVX and lower variants +# 1 0 0 1 1 1 1 1 1 -> 0x13F -> Enable SSE4_2 and lower variants +# 0 0 0 1 1 1 1 1 1 -> 0x03F -> Enable SSE4_1 and lower variants +# 0 0 0 0 1 1 1 1 1 -> 0x01F -> Enable SSSE3 and lower variants +# 0 0 0 0 0 1 1 1 1 -> 0x00F -> Enable SSE3 and lower variants +# 0 0 0 0 0 0 1 1 1 -> 0x007 -> Enable SSE2 and lower variants +# 0 0 0 0 0 0 0 1 1 -> 0x003 -> Enable SSE and lower variants +# 0 0 0 0 0 0 0 0 1 -> 0x001 -> Enable MMX ## NOTE: In x86_64 platform, it is not possible to enable sse/mmx/c using "AOM_SIMD_CAPS_MASK" as # all x86_64 platforms implement sse2. av1_test_x86() { @@ -478,8 +478,8 @@ av1_test_x86() { local cmake_command="cmake $LIBAOM_SOURCE_DIR" fi - # Available x86 isa variants: "avx2 avx sse4_1 ssse3 sse3 sse2" - local x86_isa_variants="avx2 sse4_1 sse2" + # Available x86 isa variants: "avx2 avx sse4_2 sse4_1 ssse3 sse3 sse2" + local x86_isa_variants="avx2 sse4_2 sse2" echo "Build for x86: ${target}" if ! av1_enc_build "${target}" "${cmake_command}"; then diff --git a/third_party/aom/test/dr_prediction_test.cc b/third_party/aom/test/dr_prediction_test.cc index 3865810e9b..c23b08e481 100644 --- a/third_party/aom/test/dr_prediction_test.cc +++ b/third_party/aom/test/dr_prediction_test.cc @@ -10,6 +10,7 @@ */ #include <tuple> +#include <vector> #include "third_party/googletest/src/googletest/include/gtest/gtest.h" @@ -18,6 +19,7 @@ #include "aom_mem/aom_mem.h" #include "aom_ports/aom_timer.h" +#include "aom_ports/sanitizer.h" #include "av1/common/blockd.h" #include "av1/common/pred_common.h" #include "av1/common/reconintra.h" @@ -149,8 +151,6 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { protected: static const int kMaxNumTests = 10000; static const int kIterations = 10; - static const int kDstStride = 64; - static const int kDstSize = kDstStride * kDstStride; static const int kOffset = 16; static const int kBufSize = ((2 * MAX_TX_SIZE) << 1) + 16; @@ -161,9 +161,6 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { start_angle_ = params_.start_angle; stop_angle_ = start_angle_ + 90; - dst_ref_ = &dst_ref_data_[0]; - dst_tst_ = &dst_tst_data_[0]; - dst_stride_ = kDstStride; above_ = &above_data_[kOffset]; left_ = &left_data_[kOffset]; @@ -171,16 +168,12 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { above_data_[i] = rng_.Rand8(); left_data_[i] = rng_.Rand8(); } - - for (int i = 0; i < kDstSize; ++i) { - dst_ref_[i] = 0; - dst_tst_[i] = 0; - } } ~DrPredTest() override = default; - void Predict(bool speedtest, int tx) { + void Predict(bool speedtest, int tx, Pixel *dst_ref, Pixel *dst_tst, + int dst_stride) { const int kNumTests = speedtest ? kMaxNumTests : 1; aom_usec_timer timer; int tst_time = 0; @@ -189,7 +182,7 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { aom_usec_timer_start(&timer); for (int k = 0; k < kNumTests; ++k) { - params_.ref_fn(dst_ref_, dst_stride_, bw_, bh_, above_, left_, + params_.ref_fn(dst_ref, dst_stride, bw_, bh_, above_, left_, upsample_above_, upsample_left_, dx_, dy_, bd_); } aom_usec_timer_mark(&timer); @@ -198,15 +191,17 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { if (params_.tst_fn) { aom_usec_timer_start(&timer); for (int k = 0; k < kNumTests; ++k) { - API_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst_, dst_stride_, bw_, bh_, + API_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst, dst_stride, bw_, bh_, above_, left_, upsample_above_, upsample_left_, dx_, dy_, bd_)); } aom_usec_timer_mark(&timer); tst_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); } else { - for (int i = 0; i < kDstSize; ++i) { - dst_ref_[i] = dst_tst_[i]; + for (int r = 0; r < bh_; ++r) { + for (int c = 0; c < bw_; ++c) { + dst_tst[r * dst_stride + c] = dst_ref[r * dst_stride + c]; + } } } @@ -222,18 +217,6 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { } } for (int tx = 0; tx < TX_SIZES_ALL; ++tx) { - if (params_.tst_fn == nullptr) { - for (int i = 0; i < kDstSize; ++i) { - dst_tst_[i] = (1 << bd_) - 1; - dst_ref_[i] = (1 << bd_) - 1; - } - } else { - for (int i = 0; i < kDstSize; ++i) { - dst_ref_[i] = 0; - dst_tst_[i] = 0; - } - } - bw_ = tx_size_wide[kTxSize[tx]]; bh_ = tx_size_high[kTxSize[tx]]; @@ -246,12 +229,31 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { upsample_above_ = upsample_left_ = 0; } - Predict(speedtest, tx); + // Add additional padding to allow detection of over reads/writes when + // the transform width is equal to MAX_TX_SIZE. + const int dst_stride = MAX_TX_SIZE + 16; + std::vector<Pixel> dst_ref(dst_stride * bh_); + std::vector<Pixel> dst_tst(dst_stride * bh_); + + for (int r = 0; r < bh_; ++r) { + ASAN_POISON_MEMORY_REGION(&dst_ref[r * dst_stride + bw_], + (dst_stride - bw_) * sizeof(Pixel)); + ASAN_POISON_MEMORY_REGION(&dst_tst[r * dst_stride + bw_], + (dst_stride - bw_) * sizeof(Pixel)); + } + + Predict(speedtest, tx, dst_ref.data(), dst_tst.data(), dst_stride); + + for (int r = 0; r < bh_; ++r) { + ASAN_UNPOISON_MEMORY_REGION(&dst_ref[r * dst_stride + bw_], + (dst_stride - bw_) * sizeof(Pixel)); + ASAN_UNPOISON_MEMORY_REGION(&dst_tst[r * dst_stride + bw_], + (dst_stride - bw_) * sizeof(Pixel)); + } for (int r = 0; r < bh_; ++r) { for (int c = 0; c < bw_; ++c) { - ASSERT_EQ(dst_ref_[r * dst_stride_ + c], - dst_tst_[r * dst_stride_ + c]) + ASSERT_EQ(dst_ref[r * dst_stride + c], dst_tst[r * dst_stride + c]) << bw_ << "x" << bh_ << " r: " << r << " c: " << c << " dx: " << dx_ << " dy: " << dy_ << " upsample_above: " << upsample_above_ @@ -292,18 +294,12 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { } } - Pixel dst_ref_data_[kDstSize]; - Pixel dst_tst_data_[kDstSize]; - Pixel left_data_[kBufSize]; Pixel dummy_data_[kBufSize]; Pixel above_data_[kBufSize]; - Pixel *dst_ref_; - Pixel *dst_tst_; Pixel *above_; Pixel *left_; - int dst_stride_; int enable_upsample_; int upsample_above_; diff --git a/third_party/aom/test/encode_api_test.cc b/third_party/aom/test/encode_api_test.cc index aa4084f9e4..605743f9be 100644 --- a/third_party/aom/test/encode_api_test.cc +++ b/third_party/aom/test/encode_api_test.cc @@ -654,6 +654,52 @@ TEST(EncodeAPI, AllIntraMode) { cfg.kf_max_dist = 1; EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0)); } -#endif + +TEST(EncodeAPI, AllIntraAndUsePsnr) { + aom_codec_iface_t *iface = aom_codec_av1_cx(); + aom_codec_enc_cfg_t cfg; + ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA), + AOM_CODEC_OK); + + aom_codec_ctx_t enc; + ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, AOM_CODEC_USE_PSNR), + AOM_CODEC_OK); + + aom_image_t *image = CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + ASSERT_EQ(aom_codec_encode(&enc, image, 0, 1, 0), AOM_CODEC_OK); + const aom_codec_cx_pkt_t *pkt; + aom_codec_iter_t iter = nullptr; + while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) { + if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) { + ASSERT_EQ(pkt->kind, AOM_CODEC_PSNR_PKT); + } + } + + aom_img_free(image); + ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK); +} + +// A test that reproduces bug aomedia:3534. +TEST(EncodeAPI, AllIntraAndNoRefLast) { + aom_codec_iface_t *iface = aom_codec_av1_cx(); + aom_codec_enc_cfg_t cfg; + ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA), + AOM_CODEC_OK); + + aom_codec_ctx_t enc; + ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK); + + aom_image_t *image = CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + ASSERT_EQ(aom_codec_encode(&enc, image, 0, 1, AOM_EFLAG_NO_REF_LAST), + AOM_CODEC_OK); + + aom_img_free(image); + ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK); +} +#endif // !CONFIG_REALTIME_ONLY } // namespace diff --git a/third_party/aom/test/resize_test.cc b/third_party/aom/test/resize_test.cc index 7bad45300a..755d4e3d02 100644 --- a/third_party/aom/test/resize_test.cc +++ b/third_party/aom/test/resize_test.cc @@ -11,15 +11,17 @@ #include <climits> #include <vector> + +#include "aom/aomcx.h" #include "aom_dsp/aom_dsp_common.h" -#include "common/tools_common.h" #include "av1/encoder/encoder.h" +#include "common/tools_common.h" #include "third_party/googletest/src/googletest/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" -#include "test/video_source.h" #include "test/util.h" +#include "test/video_source.h" #include "test/y4m_video_source.h" // Enable(1) or Disable(0) writing of the compressed bitstream. @@ -403,7 +405,7 @@ class ResizeRealtimeTest ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)), num_threads_(GET_PARAM(3)), set_scale_mode_(false), set_scale_mode2_(false), - set_scale_mode3_(false) {} + set_scale_mode3_(false), is_screen_(false) {} ~ResizeRealtimeTest() override = default; void PreEncodeFrameHook(libaom_test::VideoSource *video, @@ -415,6 +417,8 @@ class ResizeRealtimeTest encoder->Control(AV1E_SET_ENABLE_OBMC, 0); encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + if (is_screen_) + encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN); } if (set_scale_mode_) { struct aom_scaling_mode mode; @@ -508,6 +512,7 @@ class ResizeRealtimeTest bool set_scale_mode_; bool set_scale_mode2_; bool set_scale_mode3_; + bool is_screen_; }; // Check the AOME_SET_SCALEMODE control by downsizing to @@ -740,6 +745,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) { TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) { ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); + init_flags_ = AOM_CODEC_USE_PSNR; cfg_.g_w = 640; cfg_.g_h = 480; change_bitrate_ = true; @@ -795,6 +801,63 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) { #endif } +// Verify the dynamic resizer behavior for real time, 1 pass CBR mode for +// screen content mode. Start at low target bitrate, raise the bitrate in the +// middle of the clip (at frame# = frame_change_bitrate_), scaling-up should +// occur after bitrate is increased. +TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRateScreen) { + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + init_flags_ = AOM_CODEC_USE_PSNR; + cfg_.g_w = 352; + cfg_.g_h = 288; + change_bitrate_ = true; + frame_change_bitrate_ = 120; + set_scale_mode_ = false; + set_scale_mode2_ = false; + set_scale_mode3_ = false; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + is_screen_ = true; + DefaultConfig(); + // Disable dropped frames. + cfg_.rc_dropframe_thresh = 0; + // Starting bitrate low. + cfg_.rc_target_bitrate = 100; + cfg_.rc_resize_mode = RESIZE_DYNAMIC; + cfg_.g_forced_max_frame_width = 1280; + cfg_.g_forced_max_frame_height = 1280; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + unsigned int last_w = cfg_.g_w; + unsigned int last_h = cfg_.g_h; + unsigned int frame_number = 0; + int resize_down_count = 0; + for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + if (info->w != last_w || info->h != last_h) { + if (frame_number < frame_change_bitrate_) { + // Verify that resize down occurs, before bitrate is increased. + ASSERT_LT(info->w, last_w); + ASSERT_LT(info->h, last_h); + resize_down_count++; + } + last_w = info->w; + last_h = info->h; + } + frame_number++; + } + +#if CONFIG_AV1_DECODER + // Verify that we get at least 1 resize event in this test. + ASSERT_GE(resize_down_count, 1) + << "Resizing down should occur at lease once."; + EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames()); +#else + printf("Warning: AV1 decoder unavailable, unable to check resize count!\n"); +#endif +} + class ResizeCspTest : public ResizeTest { protected: #if WRITE_COMPRESSED_STREAM diff --git a/third_party/aom/test/variance_test.cc b/third_party/aom/test/variance_test.cc index a493a1f4cb..e31f8f820c 100644 --- a/third_party/aom/test/variance_test.cc +++ b/third_party/aom/test/variance_test.cc @@ -2165,11 +2165,7 @@ INSTANTIATE_TEST_SUITE_P( MseParams(4, 4, &aom_highbd_10_mse16x16_sve, 10), MseParams(4, 3, &aom_highbd_10_mse16x8_sve, 10), MseParams(3, 4, &aom_highbd_10_mse8x16_sve, 10), - MseParams(3, 3, &aom_highbd_10_mse8x8_sve, 10), - MseParams(4, 4, &aom_highbd_8_mse16x16_sve, 8), - MseParams(4, 3, &aom_highbd_8_mse16x8_sve, 8), - MseParams(3, 4, &aom_highbd_8_mse8x16_sve, 8), - MseParams(3, 3, &aom_highbd_8_mse8x8_sve, 8))); + MseParams(3, 3, &aom_highbd_10_mse8x8_sve, 10))); #endif // HAVE_SVE const VarianceParams kArrayHBDVariance_c[] = { diff --git a/third_party/aom/test/warp_filter_test.cc b/third_party/aom/test/warp_filter_test.cc index f0be7d226b..8844ba77ca 100644 --- a/third_party/aom/test/warp_filter_test.cc +++ b/third_party/aom/test/warp_filter_test.cc @@ -88,6 +88,12 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( SVE, AV1WarpFilterTest, libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sve)); + +#if CONFIG_AV1_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + SVE, AV1HighbdWarpFilterTest, + libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_sve)); +#endif // CONFIG_AV1_HIGHBITDEPTH #endif // HAVE_SVE } // namespace |