diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-15 03:34:42 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-15 03:34:42 +0000 |
commit | da4c7e7ed675c3bf405668739c3012d140856109 (patch) | |
tree | cdd868dba063fecba609a1d819de271f0d51b23e /media/libvpx | |
parent | Adding upstream version 125.0.3. (diff) | |
download | firefox-da4c7e7ed675c3bf405668739c3012d140856109.tar.xz firefox-da4c7e7ed675c3bf405668739c3012d140856109.zip |
Adding upstream version 126.0.upstream/126.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'media/libvpx')
171 files changed, 7097 insertions, 6024 deletions
diff --git a/media/libvpx/arm_cpu_runtime_detection_code_on_openbsd.patch b/media/libvpx/arm_cpu_runtime_detection_code_on_openbsd.patch new file mode 100644 index 0000000000..4788b3996a --- /dev/null +++ b/media/libvpx/arm_cpu_runtime_detection_code_on_openbsd.patch @@ -0,0 +1,41 @@ +# HG changeset patch +# User Chun-Min Chang <chun.m.chang@gmail.com> + +Bug 1888772 - Allow ARM CPU runtime detection code to build on OpenBSD + +diff --git a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c +--- a/vpx_ports/aarch64_cpudetect.c ++++ b/vpx_ports/aarch64_cpudetect.c +@@ -10,30 +10,30 @@ + + #include "./vpx_config.h" + #include "arm_cpudetect.h" + + #if defined(__APPLE__) + #include <sys/sysctl.h> + #endif + +-#if !CONFIG_RUNTIME_CPU_DETECT ++#if !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__) + + static int arm_get_cpu_caps(void) { + // This function should actually be a no-op. There is no way to adjust any of + // these because the RTCD tables do not exist: the functions are called + // statically. 
+ int flags = 0; + #if HAVE_NEON + flags |= HAS_NEON; + #endif // HAVE_NEON + return flags; + } + +-#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT ++#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__) + + // sysctlbyname() parameter documentation for instruction set characteristics: + // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics + static INLINE int64_t have_feature(const char *feature) { + int64_t feature_present = 0; + size_t size = sizeof(feature_present); + if (sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) { + return 0; diff --git a/media/libvpx/config/generic/vpx_config.asm b/media/libvpx/config/generic/vpx_config.asm index 47243ad198..7a1aaf999a 100644 --- a/media/libvpx/config/generic/vpx_config.asm +++ b/media/libvpx/config/generic/vpx_config.asm @@ -13,6 +13,7 @@ .equ HAVE_NEON_DOTPROD , 0 .equ HAVE_NEON_I8MM , 0 .equ HAVE_SVE , 0 +.equ HAVE_SVE2 , 0 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 .equ HAVE_MSA , 0 diff --git a/media/libvpx/config/generic/vpx_config.c b/media/libvpx/config/generic/vpx_config.c index d1c3d1acd7..922edd1ea2 100644 --- a/media/libvpx/config/generic/vpx_config.c +++ b/media/libvpx/config/generic/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=generic-gnu --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512"; +static const char* const cfg = "--target=generic-gnu --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --log=/home/cm/Work/gecko-dev/media/libvpx/config/generic/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/generic/vpx_config.h b/media/libvpx/config/generic/vpx_config.h index 774a531ed9..c885bb399a 100644 --- a/media/libvpx/config/generic/vpx_config.h +++ b/media/libvpx/config/generic/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/linux/arm/vpx_config.asm b/media/libvpx/config/linux/arm/vpx_config.asm index ee43d0f922..6be2a7f7a2 100644 --- a/media/libvpx/config/linux/arm/vpx_config.asm +++ b/media/libvpx/config/linux/arm/vpx_config.asm @@ -13,6 +13,7 @@ .equ HAVE_NEON_DOTPROD , 0 .equ HAVE_NEON_I8MM , 0 .equ HAVE_SVE , 0 +.equ HAVE_SVE2 , 0 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 .equ HAVE_MSA , 0 diff --git a/media/libvpx/config/linux/arm/vpx_config.c b/media/libvpx/config/linux/arm/vpx_config.c index c885d910c0..c634e2af66 100644 --- a/media/libvpx/config/linux/arm/vpx_config.c +++ b/media/libvpx/config/linux/arm/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-runtime-cpu-detect --enable-realtime-only"; +static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-runtime-cpu-detect --enable-realtime-only --log=/home/cm/Work/gecko-dev/media/libvpx/config/linux/arm/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/arm/vpx_config.h b/media/libvpx/config/linux/arm/vpx_config.h index bfd2c04e07..99a55f0ea9 100644 --- a/media/libvpx/config/linux/arm/vpx_config.h +++ b/media/libvpx/config/linux/arm/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/linux/arm64/vp9_rtcd.h b/media/libvpx/config/linux/arm64/vp9_rtcd.h index 738de4f9f4..b7d828d446 100644 --- a/media/libvpx/config/linux/arm64/vp9_rtcd.h +++ b/media/libvpx/config/linux/arm64/vp9_rtcd.h @@ -35,11 +35,13 @@ extern "C" { int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); -#define vp9_block_error vp9_block_error_neon +int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int 
block_size); int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); -#define vp9_block_error_fp vp9_block_error_fp_neon +int64_t vp9_block_error_fp_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); @@ -96,6 +98,10 @@ static void setup_rtcd_internal(void) (void)flags; + vp9_block_error = vp9_block_error_neon; + if (flags & HAS_SVE) vp9_block_error = vp9_block_error_sve; + vp9_block_error_fp = vp9_block_error_fp_neon; + if (flags & HAS_SVE) vp9_block_error_fp = vp9_block_error_fp_sve; } #endif diff --git a/media/libvpx/config/linux/arm64/vpx_config.asm b/media/libvpx/config/linux/arm64/vpx_config.asm index 499c16202c..c51a76b3f6 100644 --- a/media/libvpx/config/linux/arm64/vpx_config.asm +++ b/media/libvpx/config/linux/arm64/vpx_config.asm @@ -13,6 +13,7 @@ .equ HAVE_NEON_DOTPROD , 1 .equ HAVE_NEON_I8MM , 1 .equ HAVE_SVE , 1 +.equ HAVE_SVE2 , 1 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 .equ HAVE_MSA , 0 diff --git a/media/libvpx/config/linux/arm64/vpx_config.c b/media/libvpx/config/linux/arm64/vpx_config.c index 74baa0689c..c0d714503f 100644 --- a/media/libvpx/config/linux/arm64/vpx_config.c +++ b/media/libvpx/config/linux/arm64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. 
All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=arm64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only"; +static const char* const cfg = "--target=arm64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only --log=/home/cm/Work/gecko-dev/media/libvpx/config/linux/arm64/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/arm64/vpx_config.h b/media/libvpx/config/linux/arm64/vpx_config.h index 3c5f2e33ca..12251ee0c1 100644 --- a/media/libvpx/config/linux/arm64/vpx_config.h +++ b/media/libvpx/config/linux/arm64/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 1 #define HAVE_NEON_I8MM 1 #define HAVE_SVE 1 +#define HAVE_SVE2 1 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h b/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h index 5a9b05ca14..2c31ee4ef9 100644 --- a/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h +++ b/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h @@ -916,7 +916,8 @@ void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t di uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size); -#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_neon +uint64_t vpx_sum_squares_2d_i16_sve(const int16_t *src, int stride, int size); +RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size); void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t 
*above, const uint8_t *left); void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); @@ -1148,6 +1149,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon_dotprod; vpx_sse = vpx_sse_neon; if (flags & HAS_NEON_DOTPROD) vpx_sse = vpx_sse_neon_dotprod; + vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_neon; + if (flags & HAS_SVE) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sve; vpx_variance16x16 = vpx_variance16x16_neon; if (flags & HAS_NEON_DOTPROD) vpx_variance16x16 = vpx_variance16x16_neon_dotprod; vpx_variance16x32 = vpx_variance16x32_neon; diff --git a/media/libvpx/config/linux/ia32/vpx_config.asm b/media/libvpx/config/linux/ia32/vpx_config.asm index eaa3950d37..5a92abf939 100644 --- a/media/libvpx/config/linux/ia32/vpx_config.asm +++ b/media/libvpx/config/linux/ia32/vpx_config.asm @@ -10,6 +10,7 @@ %define HAVE_NEON_DOTPROD 0 %define HAVE_NEON_I8MM 0 %define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 diff --git a/media/libvpx/config/linux/ia32/vpx_config.c b/media/libvpx/config/linux/ia32/vpx_config.c index 6805ab62a8..7024ca989f 100644 --- a/media/libvpx/config/linux/ia32/vpx_config.c +++ b/media/libvpx/config/linux/ia32/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=x86-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +static const char* const cfg = "--target=x86-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/linux/ia32/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/ia32/vpx_config.h b/media/libvpx/config/linux/ia32/vpx_config.h index 69fd63bf02..b4cc10a906 100644 --- a/media/libvpx/config/linux/ia32/vpx_config.h +++ b/media/libvpx/config/linux/ia32/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/linux/x64/vpx_config.asm b/media/libvpx/config/linux/x64/vpx_config.asm index 8715768a2e..148a894979 100644 --- a/media/libvpx/config/linux/x64/vpx_config.asm +++ b/media/libvpx/config/linux/x64/vpx_config.asm @@ -10,6 +10,7 @@ %define HAVE_NEON_DOTPROD 0 %define HAVE_NEON_I8MM 0 %define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 diff --git a/media/libvpx/config/linux/x64/vpx_config.c b/media/libvpx/config/linux/x64/vpx_config.c index e4dcb394c3..f38bd16290 100644 --- a/media/libvpx/config/linux/x64/vpx_config.c +++ b/media/libvpx/config/linux/x64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=x86_64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +static const char* const cfg = "--target=x86_64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/linux/x64/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/x64/vpx_config.h b/media/libvpx/config/linux/x64/vpx_config.h index ab4439aaf4..d91509ad10 100644 --- a/media/libvpx/config/linux/x64/vpx_config.h +++ b/media/libvpx/config/linux/x64/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/mac/ia32/vpx_config.asm b/media/libvpx/config/mac/ia32/vpx_config.asm index eaa3950d37..5a92abf939 100644 --- a/media/libvpx/config/mac/ia32/vpx_config.asm +++ b/media/libvpx/config/mac/ia32/vpx_config.asm @@ -10,6 +10,7 @@ %define HAVE_NEON_DOTPROD 0 %define HAVE_NEON_I8MM 0 %define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 diff --git a/media/libvpx/config/mac/ia32/vpx_config.c b/media/libvpx/config/mac/ia32/vpx_config.c index 3e5d3ec0f3..2ee9d0ebb0 100644 --- a/media/libvpx/config/mac/ia32/vpx_config.c +++ b/media/libvpx/config/mac/ia32/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=x86-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +static const char* const cfg = "--target=x86-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/mac/ia32/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/mac/ia32/vpx_config.h b/media/libvpx/config/mac/ia32/vpx_config.h index 69fd63bf02..b4cc10a906 100644 --- a/media/libvpx/config/mac/ia32/vpx_config.h +++ b/media/libvpx/config/mac/ia32/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/mac/x64/vpx_config.asm b/media/libvpx/config/mac/x64/vpx_config.asm index 8715768a2e..148a894979 100644 --- a/media/libvpx/config/mac/x64/vpx_config.asm +++ b/media/libvpx/config/mac/x64/vpx_config.asm @@ -10,6 +10,7 @@ %define HAVE_NEON_DOTPROD 0 %define HAVE_NEON_I8MM 0 %define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 diff --git a/media/libvpx/config/mac/x64/vpx_config.c b/media/libvpx/config/mac/x64/vpx_config.c index 9a06646fdc..51fceeb6e3 100644 --- a/media/libvpx/config/mac/x64/vpx_config.c +++ b/media/libvpx/config/mac/x64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=x86_64-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +static const char* const cfg = "--target=x86_64-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/mac/x64/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/mac/x64/vpx_config.h b/media/libvpx/config/mac/x64/vpx_config.h index ab4439aaf4..d91509ad10 100644 --- a/media/libvpx/config/mac/x64/vpx_config.h +++ b/media/libvpx/config/mac/x64/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/win/aarch64/vpx_config.asm b/media/libvpx/config/win/aarch64/vpx_config.asm index 24eb1a8cba..32d700f1bb 100644 --- a/media/libvpx/config/win/aarch64/vpx_config.asm +++ b/media/libvpx/config/win/aarch64/vpx_config.asm @@ -12,7 +12,8 @@ .equ HAVE_NEON , 1 .equ HAVE_NEON_DOTPROD , 1 .equ HAVE_NEON_I8MM , 1 -.equ HAVE_SVE , 1 +.equ HAVE_SVE , 0 +.equ HAVE_SVE2 , 0 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 .equ HAVE_MSA , 0 diff --git a/media/libvpx/config/win/aarch64/vpx_config.c b/media/libvpx/config/win/aarch64/vpx_config.c index 13cc13a95d..b8f4ec8754 100644 --- a/media/libvpx/config/win/aarch64/vpx_config.c +++ b/media/libvpx/config/win/aarch64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=arm64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only"; +static const char* const cfg = "--target=arm64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only --disable-sve --log=/home/cm/Work/gecko-dev/media/libvpx/config/win/aarch64/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/win/aarch64/vpx_config.h b/media/libvpx/config/win/aarch64/vpx_config.h index c3cc860f18..a81f868053 100644 --- a/media/libvpx/config/win/aarch64/vpx_config.h +++ b/media/libvpx/config/win/aarch64/vpx_config.h @@ -21,7 +21,8 @@ #define HAVE_NEON 1 #define HAVE_NEON_DOTPROD 1 #define HAVE_NEON_I8MM 1 -#define HAVE_SVE 1 +#define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/win/ia32/vpx_config.asm b/media/libvpx/config/win/ia32/vpx_config.asm index cb1aa7ce6a..9c7e3ce2c2 100755 --- a/media/libvpx/config/win/ia32/vpx_config.asm +++ b/media/libvpx/config/win/ia32/vpx_config.asm @@ -10,6 +10,7 @@ %define HAVE_NEON_DOTPROD 0 %define HAVE_NEON_I8MM 0 %define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 diff --git a/media/libvpx/config/win/ia32/vpx_config.c b/media/libvpx/config/win/ia32/vpx_config.c index 33c836213b..8cdd6c30b2 100644 --- a/media/libvpx/config/win/ia32/vpx_config.c +++ b/media/libvpx/config/win/ia32/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=x86-win32-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +static const char* const cfg = "--target=x86-win32-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/win/ia32/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/win/ia32/vpx_config.h b/media/libvpx/config/win/ia32/vpx_config.h index 9fe256f4ad..b62188c71c 100644 --- a/media/libvpx/config/win/ia32/vpx_config.h +++ b/media/libvpx/config/win/ia32/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/config/win/x64/vpx_config.asm b/media/libvpx/config/win/x64/vpx_config.asm index a1d34d6d37..d5f5f3968e 100644 --- a/media/libvpx/config/win/x64/vpx_config.asm +++ b/media/libvpx/config/win/x64/vpx_config.asm @@ -10,6 +10,7 @@ %define HAVE_NEON_DOTPROD 0 %define HAVE_NEON_I8MM 0 %define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 diff --git a/media/libvpx/config/win/x64/vpx_config.c b/media/libvpx/config/win/x64/vpx_config.c index 8c04c1a3cf..57904c7dc6 100644 --- a/media/libvpx/config/win/x64/vpx_config.c +++ b/media/libvpx/config/win/x64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=x86_64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm"; +static const char* const cfg = "--target=x86_64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/win/x64/config.log"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/win/x64/vpx_config.h b/media/libvpx/config/win/x64/vpx_config.h index 068c6d2a99..448f13e4a1 100644 --- a/media/libvpx/config/win/x64/vpx_config.h +++ b/media/libvpx/config/win/x64/vpx_config.h @@ -22,6 +22,7 @@ #define HAVE_NEON_DOTPROD 0 #define HAVE_NEON_I8MM 0 #define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 diff --git a/media/libvpx/generate_sources_mozbuild.sh b/media/libvpx/generate_sources_mozbuild.sh index ef9bc696f3..4efcb54aa1 100755 --- a/media/libvpx/generate_sources_mozbuild.sh +++ b/media/libvpx/generate_sources_mozbuild.sh @@ -169,7 +169,8 @@ function gen_rtcd_header { # $1 - Header file directory. # $2 - Config command line. function gen_config_files { - ./configure $2 > /dev/null + ./configure $2 --log=$BASE_DIR/$LIBVPX_CONFIG_DIR/$1/config.log > /dev/null + echo "Log file: $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/config.log" # Disable HAVE_UNISTD_H. 
( echo '/HAVE_UNISTD_H'; echo 'd' ; echo 'w' ; echo 'q' ) | ed -s vpx_config.h @@ -203,6 +204,7 @@ all_platforms="${all_platforms} --disable-avx512" x86_platforms="--enable-postproc --enable-vp9-postproc --as=yasm" arm_platforms="--enable-runtime-cpu-detect --enable-realtime-only" arm64_platforms="--enable-realtime-only" +disable_sve="--disable-sve" # Bug 1885585 gen_config_files linux/x64 "--target=x86_64-linux-gcc ${all_platforms} ${x86_platforms}" gen_config_files linux/ia32 "--target=x86-linux-gcc ${all_platforms} ${x86_platforms}" @@ -213,7 +215,7 @@ gen_config_files win/ia32 "--target=x86-win32-gcc ${all_platforms} ${x86_platfor gen_config_files linux/arm "--target=armv7-linux-gcc ${all_platforms} ${arm_platforms}" gen_config_files linux/arm64 "--target=arm64-linux-gcc ${all_platforms} ${arm64_platforms}" -gen_config_files win/aarch64 "--target=arm64-win64-vs15 ${all_platforms} ${arm64_platforms}" +gen_config_files win/aarch64 "--target=arm64-win64-vs15 ${all_platforms} ${arm64_platforms} ${disable_sve}" # Bug 1885585 gen_config_files generic "--target=generic-gnu ${all_platforms}" @@ -236,7 +238,7 @@ gen_rtcd_header win/ia32 x86 gen_rtcd_header linux/arm armv7 gen_rtcd_header linux/arm64 arm64 -gen_rtcd_header win/aarch64 arm64 +gen_rtcd_header win/aarch64 arm64 $disable_sve # Bug 1885585 gen_rtcd_header generic generic @@ -275,6 +277,7 @@ config=$(print_config linux/arm64) make_clean make libvpx_srcs.txt target=libs $config > /dev/null convert_srcs_to_project_files libvpx_srcs.txt ARM64 +# Bug 1885585: The sve files will be excluded from the win/aarch64 build in moz.build. echo "Generate generic source list." 
config=$(print_config generic) diff --git a/media/libvpx/input_frame_validation.patch b/media/libvpx/input_frame_validation.patch index 1cb33e192f..37f755e022 100644 --- a/media/libvpx/input_frame_validation.patch +++ b/media/libvpx/input_frame_validation.patch @@ -8,15 +8,15 @@ MozReview-Commit-ID: BxDCnJe0mzs diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c -@@ -921,20 +921,29 @@ static vpx_codec_err_t vp8e_encode(vpx_c - dst_time_stamp = - pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; - dst_end_time_stamp = (pts_val + (int64_t)duration) * - ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; +@@ -989,20 +989,29 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, + &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "conversion of relative pts + duration to ticks would overflow"); + } + dst_end_time_stamp = + pts_end * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; - if (img != NULL) { res = image2yuvconfig(img, &sd); - + - if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, &sd, - dst_time_stamp, dst_end_time_stamp)) { - VP8_COMP *cpi = (VP8_COMP *)ctx->cpi; diff --git a/media/libvpx/libvpx/.mailmap b/media/libvpx/libvpx/.mailmap index bb0ddd95b2..7206b5ebec 100644 --- a/media/libvpx/libvpx/.mailmap +++ b/media/libvpx/libvpx/.mailmap @@ -20,6 +20,7 @@ Hui Su <huisu@google.com> Jacky Chen <jackychen@google.com> Jim Bankoski <jimbankoski@google.com> Johann Koenig <johannkoenig@google.com> +Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com> Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com> Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org> Johann <johann@duck.com> <johann.koenig@gmail.com> @@ -53,4 +54,4 @@ Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com> Yaowu Xu <yaowu@google.com> <Yaowu Xu> Venkatarama NG. 
Avadhani <venkatarama.avadhani@ittiam.com> Vitaly Buka <vitalybuka@chromium.org> <vitlaybuka@chromium.org> -xiwei gu <guxiwei-hf@loongson.cn> +Xiwei Gu <guxiwei-hf@loongson.cn> diff --git a/media/libvpx/libvpx/AUTHORS b/media/libvpx/libvpx/AUTHORS index 2db4a113e4..5515e26589 100644 --- a/media/libvpx/libvpx/AUTHORS +++ b/media/libvpx/libvpx/AUTHORS @@ -25,6 +25,7 @@ Andrew Salkeld <andrew.salkeld@arm.com> Angie Chen <yunqi@google.com> Angie Chiang <angiebird@google.com> Anton Venema <anton.venema@liveswitch.com> +Anupam Pandey <anupam.pandey@ittiam.com> Aron Rosenberg <arosenberg@logitech.com> Attila Nagy <attilanagy@google.com> Birk Magnussen <birk.magnussen@googlemail.com> @@ -34,6 +35,8 @@ Brion Vibber <bvibber@wikimedia.org> changjun.yang <changjun.yang@intel.com> Charles 'Buck' Krasic <ckrasic@google.com> Cheng Chen <chengchen@google.com> +Chen Wang <wangchen20@iscas.ac.cn> +Cherma Rajan A <cherma.rajan@ittiam.com> Chi Yo Tsai <chiyotsai@google.com> chm <chm@rock-chips.com> Chris Cunningham <chcunningham@chromium.org> @@ -60,6 +63,8 @@ Fritz Koenig <frkoenig@google.com> Fyodor Kyslov <kyslov@google.com> Gabriel Marin <gmx@chromium.org> Gaute Strokkenes <gaute.strokkenes@broadcom.com> +George Steed <george.steed@arm.com> +Gerda Zsejke More <gerdazsejke.more@arm.com> Geza Lore <gezalore@gmail.com> Ghislain MARY <ghislainmary2@gmail.com> Giuseppe Scrivano <gscrivano@gnu.org> @@ -103,6 +108,7 @@ Jin Bo <jinbo@loongson.cn> Jingning Han <jingning@google.com> Joel Fernandes <joelaf@google.com> Joey Parrish <joeyparrish@google.com> +Johann <johann@duck.com> Johann Koenig <johannkoenig@google.com> John Koleszar <jkoleszar@google.com> Johnny Klonaris <google@jawknee.com> @@ -120,6 +126,7 @@ KO Myung-Hun <komh@chollian.net> Konstantinos Margaritis <konma@vectorcamp.gr> Kyle Siefring <kylesiefring@gmail.com> Lawrence Velázquez <larryv@macports.org> +L. E. 
Segovia <amy@amyspark.me> Linfeng Zhang <linfengz@google.com> Liu Peng <pengliu.mail@gmail.com> Lou Quillio <louquillio@google.com> @@ -147,6 +154,7 @@ Mirko Bonadei <mbonadei@google.com> Moriyoshi Koizumi <mozo@mozo.jp> Morton Jonuschat <yabawock@gmail.com> Nathan E. Egge <negge@mozilla.com> +Neeraj Gadgil <neeraj.gadgil@ittiam.com> Neil Birkbeck <neil.birkbeck@gmail.com> Nico Weber <thakis@chromium.org> Niveditha Rau <niveditha.rau@gmail.com> @@ -213,7 +221,8 @@ Vitaly Buka <vitalybuka@chromium.org> Vlad Tsyrklevich <vtsyrklevich@chromium.org> Wan-Teh Chang <wtc@google.com> Wonkap Jang <wonkap@google.com> -xiwei gu <guxiwei-hf@loongson.cn> +Xiahong Bao <xiahong.bao@nxp.com> +Xiwei Gu <guxiwei-hf@loongson.cn> Yaowu Xu <yaowu@google.com> Yi Luo <luoyi@google.com> Yongzhe Wang <yongzhe@google.com> diff --git a/media/libvpx/libvpx/CHANGELOG b/media/libvpx/libvpx/CHANGELOG index 21070785ed..87f0d7f708 100644 --- a/media/libvpx/libvpx/CHANGELOG +++ b/media/libvpx/libvpx/CHANGELOG @@ -1,7 +1,79 @@ -20yy-mm-dd v1.14.0 "V Duck" +2024-01-02 v1.14.0 "Venetian Duck" This release drops support for old C compilers, such as Visual Studio 2012 and older, that disallow mixing variable declarations and statements (a C99 - feature). + feature). It adds support for run-time CPU feature detection for Arm + platforms, as well as support for darwin23 (macOS 14). + + - Upgrading: + This release is ABI incompatible with the previous release. + + Various new features for rate control library for real-time: SVC parallel + encoding, loopfilter level, support for frame dropping, and screen content. + + New callback function send_tpl_gop_stats for vp9 external rate control + library, which can be used to transmit TPL stats for a group of pictures. A + public header vpx_tpl.h is added for the definition of TPL stats used in + this callback. + + libwebm is upgraded to libwebm-1.0.0.29-9-g1930e3c. 
+ + - Enhancement: + Improvements on Neon optimizations: VoD: 12-35% speed up for bitdepth 8, + 68%-151% speed up for high bitdepth. + + Improvements on AVX2 and SSE optimizations. + Improvements on LSX optimizations for LoongArch. + 42-49% speedup on speed 0 VoD encoding. + Android API level predicates. + + - Bug fixes: + Fix to missing prototypes from the rtcd header. + Fix to segfault when total size is enlarged but width is smaller. + Fix to the build for arm64ec using MSVC. + Fix to copy BLOCK_8X8's mi to PICK_MODE_CONTEXT::mic. + Fix to -Wshadow warnings. + Fix to heap overflow in vpx_get4x4sse_cs_neon. + Fix to buffer overrun in highbd Neon subpel variance filters. + Added bitexact encode test script. + Fix to -Wl,-z,defs with Clang's sanitizers. + Fix to decoder stability after error & continued decoding. + Fix to mismatch of VP9 encode with NEON intrinsics with C only version. + Fix to Arm64 MSVC compile vpx_highbd_fdct4x4_neon. + Fix to fragments count before use. + Fix to a case where target bandwidth is 0 for SVC. + Fix mask in vp9_quantize_avx2,highbd_get_max_lane_eob. + Fix to int overflow in vp9_calc_pframe_target_size_one_pass_cbr. + Fix to integer overflow in vp8,ratectrl.c. + Fix to integer overflow in vp9 svc. + Fix to avg_frame_bandwidth overflow. + Fix to per frame qp for temporal layers. + Fix to unsigned integer overflow in sse computation. + Fix to uninitialized mesh feature for BEST mode. + Fix to overflow in highbd temporal_filter. + Fix to unaligned loads w/w==4 in vpx_convolve_copy_neon. + Skip arm64_neon.h workaround w/VS >= 2019. + Fix to c vs avx mismatch of diamond_search_sad(). + Fix to c vs intrinsic mismatch of vpx_hadamard_32x32() function. + Fix to a bug in vpx_hadamard_32x32_neon(). + Fix to Clang -Wunreachable-code-aggressive warnings. + Fix to a bug in vpx_highbd_hadamard_32x32_neon(). + Fix to -Wunreachable-code in mfqe_partition. + Force mode search on 64x64 if no mode is selected. 
+ Fix to ubsan failure caused by left shift of negative. + Fix to integer overflow in calc_pframe_target_size. + Fix to float-cast-overflow in vp8_change_config(). + Fix to a null ptr before use. + Conditionally skip using inter frames in speed features. + Remove invalid reference frames. + Disable intra mode search speed features conditionally. + Set nonrd keyframe under dynamic change of deadline for rtc. + Fix to scaled reference offsets. + Set skip_recode=0 in nonrd_pick_sb_modes. + Fix to an edge case when downsizing to one. + Fix to a bug in frame scaling. + Fix to pred buffer stride. + Fix to a bug in simple motion search. + Update frame size in actual encoding. 2023-09-29 v1.13.1 "Ugly Duckling" This release contains two security related fixes. One each for VP8 and VP9. diff --git a/media/libvpx/libvpx/README b/media/libvpx/libvpx/README index 4c25b15d81..6dbd164c34 100644 --- a/media/libvpx/libvpx/README +++ b/media/libvpx/libvpx/README @@ -1,5 +1,3 @@ -v1.13.1 Ugly Duckling - Welcome to the WebM VP8/VP9 Codec SDK! COMPILING THE APPLICATIONS/LIBRARIES: @@ -183,6 +181,44 @@ CODE STYLE: See also: http://clang.llvm.org/docs/ClangFormat.html +PROFILE GUIDED OPTIMIZATION (PGO) + Profile Guided Optimization can be enabled for Clang builds using the + commands: + + $ export CC=clang + $ export CXX=clang++ + $ ../libvpx/configure --enable-profile + $ make + + Generate one or multiple PGO profile files by running vpxdec or vpxenc. For + example: + + $ ./vpxdec ../vpx/out_ful/vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm \ + -o - > /dev/null + + To convert and merge the raw profile files, use the llvm-profdata tool: + + $ llvm-profdata merge -o perf.profdata default_8382761441159425451_0.profraw + + Then, rebuild the project with the new profile file: + + $ make clean + $ ../libvpx/configure --use-profile=perf.profdata + $ make + + Note: Always use the llvm-profdata from the toolchain that is used for + compiling the PGO-enabled binary. 
+ + To observe the improvements from a PGO-enabled build, enable and compare the + list of failed optimizations by using the -Rpass-missed compiler flag. For + example, to list the failed loop vectorizations: + + $ ../libvpx/configure --use-profile=perf.profdata \ + --extra-cflags=-Rpass-missed=loop-vectorize + + For guidance on utilizing PGO files to identify potential optimization + opportunities, see: tools/README.pgo.md + SUPPORT This library is an open source project supported by its community. Please email webm-discuss@webmproject.org for help. diff --git a/media/libvpx/libvpx/build/make/Android.mk b/media/libvpx/libvpx/build/make/Android.mk index ba24f541b1..533f43c1c2 100644 --- a/media/libvpx/libvpx/build/make/Android.mk +++ b/media/libvpx/libvpx/build/make/Android.mk @@ -15,13 +15,9 @@ ifdef NDK_ROOT # In an Android project place a libvpx checkout in the jni directory. # Run the configure script from the jni directory. Base libvpx # encoder/decoder configuration will look similar to: -# ./libvpx/configure --target=armv7-android-gcc --disable-examples \ +# ./libvpx/configure --target=arm64-android-gcc --disable-examples \ # --enable-external-build # -# When targeting Android, realtime-only is enabled by default. This can -# be overridden by adding the command line flag: -# --disable-realtime-only -# # This will create .mk files that contain variables that contain the # source files to compile. # @@ -38,11 +34,14 @@ ifdef NDK_ROOT # but the resulting library *must* be run on devices supporting all of the # enabled extensions. They can be disabled individually with # --disable-{sse2, sse3, ssse3, sse4_1, avx, avx2, avx512} -# --disable-neon[-asm] +# --disable-neon{, -asm, -neon-dotprod, -neon-i8mm} +# --disable-sve # --disable-{dspr2, msa} # -# Running ndk-build will build libvpx and include it in your project. +# Running ndk-build will build libvpx and include it in your project. 
Set +# APP_ABI to match the --target passed to configure: +# https://developer.android.com/ndk/guides/application_mk#app_abi. # CONFIG_DIR := $(LOCAL_PATH)/ diff --git a/media/libvpx/libvpx/build/make/Makefile b/media/libvpx/libvpx/build/make/Makefile index 199ed78058..658b37617b 100644 --- a/media/libvpx/libvpx/build/make/Makefile +++ b/media/libvpx/libvpx/build/make/Makefile @@ -150,6 +150,8 @@ $(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm $(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm $(BUILD_PFX)%_sve.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve $(BUILD_PFX)%_sve.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve +$(BUILD_PFX)%_sve2.c.d: CFLAGS += -march=armv9-a+sve2 +$(BUILD_PFX)%_sve2.c.o: CFLAGS += -march=armv9-a+sve2 # POWER $(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx diff --git a/media/libvpx/libvpx/build/make/configure.sh b/media/libvpx/libvpx/build/make/configure.sh index 869793a296..009bf7db5c 100644 --- a/media/libvpx/libvpx/build/make/configure.sh +++ b/media/libvpx/libvpx/build/make/configure.sh @@ -74,6 +74,8 @@ Build options: --cpu=CPU optimize for a specific cpu rather than a family --extra-cflags=ECFLAGS add ECFLAGS to CFLAGS [$CFLAGS] --extra-cxxflags=ECXXFLAGS add ECXXFLAGS to CXXFLAGS [$CXXFLAGS] + --use-profile=PROFILE_FILE + Use PROFILE_FILE for PGO ${toggle_extra_warnings} emit harmless warnings (always non-fatal) ${toggle_werror} treat warnings as errors, if possible (not available with all compilers) @@ -81,6 +83,7 @@ Build options: ${toggle_pic} turn on/off Position Independent Code ${toggle_ccache} turn on/off compiler cache ${toggle_debug} enable/disable debug mode + ${toggle_profile} enable/disable profiling ${toggle_gprof} enable/disable gprof profiling instrumentation ${toggle_gcov} enable/disable gcov coverage instrumentation ${toggle_thumb} enable/disable building arm assembly in thumb mode @@ -429,6 +432,26 @@ check_gcc_machine_options() { fi } +check_neon_sve_bridge_compiles() 
{ + if enabled sve; then + check_cc -march=armv8.2-a+dotprod+i8mm+sve <<EOF +#ifndef __ARM_NEON_SVE_BRIDGE +#error 1 +#endif +#include <arm_sve.h> +#include <arm_neon_sve_bridge.h> +EOF + compile_result=$? + if [ ${compile_result} -ne 0 ]; then + log_echo " disabling sve: arm_neon_sve_bridge.h not supported by compiler" + log_echo " disabling sve2: arm_neon_sve_bridge.h not supported by compiler" + disable_feature sve + disable_feature sve2 + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sve --disable-sve2 " + fi + fi +} + check_gcc_avx512_compiles() { if disabled gcc; then return @@ -611,6 +634,9 @@ process_common_cmdline() { --extra-cxxflags=*) extra_cxxflags="${optval}" ;; + --use-profile=*) + pgo_file=${optval} + ;; --enable-?*|--disable-?*) eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'` if is_in ${option} ${ARCH_EXT_LIST}; then @@ -951,7 +977,7 @@ EOF add_cflags "-mmacosx-version-min=10.15" add_ldflags "-mmacosx-version-min=10.15" ;; - *-darwin2[0-2]-*) + *-darwin2[0-3]-*) add_cflags "-arch ${toolchain%%-*}" add_ldflags "-arch ${toolchain%%-*}" ;; @@ -980,36 +1006,18 @@ EOF case ${toolchain} in arm*) soft_enable runtime_cpu_detect - # Arm ISA extensions are treated as supersets. - case ${tgt_isa} in - arm64|armv8) - for ext in ${ARCH_EXT_LIST_AARCH64}; do - # Disable higher order extensions to simplify dependencies. - if [ "$disable_exts" = "yes" ]; then - if ! disabled $ext; then - RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} " - disable_feature $ext - fi - elif disabled $ext; then - disable_exts="yes" - else - soft_enable $ext - fi - done - ;; - armv7|armv7s) - soft_enable neon - # Only enable neon_asm when neon is also enabled. - enabled neon && soft_enable neon_asm - # If someone tries to force it through, die. 
- if disabled neon && enabled neon_asm; then - die "Disabling neon while keeping neon-asm is not supported" - fi - ;; - esac - asm_conversion_cmd="cat" + if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then + soft_enable neon + # Only enable neon_asm when neon is also enabled. + enabled neon && soft_enable neon_asm + # If someone tries to force it through, die. + if disabled neon && enabled neon_asm; then + die "Disabling neon while keeping neon-asm is not supported" + fi + fi + asm_conversion_cmd="cat" case ${tgt_cc} in gcc) link_with_cc=gcc @@ -1228,6 +1236,38 @@ EOF fi ;; esac + + # AArch64 ISA extensions are treated as supersets. + if [ ${tgt_isa} = "arm64" ] || [ ${tgt_isa} = "armv8" ]; then + aarch64_arch_flag_neon="arch=armv8-a" + aarch64_arch_flag_neon_dotprod="arch=armv8.2-a+dotprod" + aarch64_arch_flag_neon_i8mm="arch=armv8.2-a+dotprod+i8mm" + aarch64_arch_flag_sve="arch=armv8.2-a+dotprod+i8mm+sve" + aarch64_arch_flag_sve2="arch=armv9-a+sve2" + for ext in ${ARCH_EXT_LIST_AARCH64}; do + if [ "$disable_exts" = "yes" ]; then + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} " + soft_disable $ext + else + # Check the compiler supports the -march flag for the extension. + # This needs to happen after toolchain/OS inspection so we handle + # $CROSS etc correctly when checking for flags, else these will + # always fail. + flag="$(eval echo \$"aarch64_arch_flag_${ext}")" + check_gcc_machine_option "${flag}" "${ext}" + if ! enabled $ext; then + # Disable higher order extensions to simplify dependencies. 
+ disable_exts="yes" + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} " + soft_disable $ext + fi + fi + done + if enabled sve; then + check_neon_sve_bridge_compiles + fi + fi + ;; mips*) link_with_cc=gcc @@ -1484,6 +1524,14 @@ EOF ;; esac + # Enable PGO + if [ -n "${pgo_file}" ]; then + check_add_cflags -fprofile-use=${pgo_file} || \ + die "-fprofile-use is not supported by compiler" + check_add_ldflags -fprofile-use=${pgo_file} || \ + die "-fprofile-use is not supported by linker" + fi + # Try to enable CPU specific tuning if [ -n "${tune_cpu}" ]; then if [ -n "${tune_cflags}" ]; then @@ -1504,6 +1552,9 @@ EOF else check_add_cflags -DNDEBUG fi + enabled profile && + check_add_cflags -fprofile-generate && + check_add_ldflags -fprofile-generate enabled gprof && check_add_cflags -pg && check_add_ldflags -pg enabled gcov && diff --git a/media/libvpx/libvpx/build/make/rtcd.pl b/media/libvpx/libvpx/build/make/rtcd.pl index 0b9e16738e..025238d678 100755 --- a/media/libvpx/libvpx/build/make/rtcd.pl +++ b/media/libvpx/libvpx/build/make/rtcd.pl @@ -487,7 +487,7 @@ if ($opts{arch} eq 'x86') { @ALL_ARCHS = filter(qw/neon_asm neon/); arm; } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) { - @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve/); + @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve sve2/); @REQUIRES = filter(qw/neon/); &require(@REQUIRES); arm; diff --git a/media/libvpx/libvpx/configure b/media/libvpx/libvpx/configure index b212e0709d..97e78996e8 100755 --- a/media/libvpx/libvpx/configure +++ b/media/libvpx/libvpx/configure @@ -260,6 +260,7 @@ ARCH_EXT_LIST_AARCH64=" neon_dotprod neon_i8mm sve + sve2 " ARCH_EXT_LIST_X86=" @@ -376,6 +377,7 @@ CMDLINE_SELECT=" install_libs install_srcs debug + profile gprof gcov pic @@ -659,6 +661,7 @@ process_toolchain() { check_add_cflags -Wmissing-declarations check_add_cflags -Wmissing-prototypes check_add_cflags -Wshadow + check_add_cflags -Wstrict-prototypes check_add_cflags -Wuninitialized check_add_cflags 
-Wunreachable-code-aggressive check_add_cflags -Wunused @@ -677,6 +680,10 @@ process_toolchain() { # would be needed to apply this only to test/*.cc. check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32 + # Do not allow implicit vector type conversions on Clang builds (this + # is already the default on GCC builds). + check_add_cflags -flax-vector-conversions=none + # Quiet gcc 6 vs 7 abi warnings: # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728 if enabled arm; then diff --git a/media/libvpx/libvpx/examples/resize_util.c b/media/libvpx/libvpx/examples/resize_util.c index 5fb63e1660..083bd2519d 100644 --- a/media/libvpx/libvpx/examples/resize_util.c +++ b/media/libvpx/libvpx/examples/resize_util.c @@ -20,7 +20,7 @@ static const char *exec_name = NULL; -static void usage() { +static void usage(void) { printf("Usage:\n"); printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ", exec_name); diff --git a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c index 998e4fb20d..4050c093cd 100644 --- a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c +++ b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c @@ -1156,12 +1156,13 @@ int main(int argc, const char **argv) { #if CONFIG_VP9_DECODER && !SIMULCAST_MODE vpx_codec_control(&encoder, VP9E_GET_SVC_LAYER_ID, &layer_id); // Don't look for mismatch on top spatial and top temporal layers as they - // are non reference frames. + // are non reference frames. Don't look at frames whose top spatial layer + // is dropped. 
if ((enc_cfg.ss_number_layers > 1 || enc_cfg.ts_number_layers > 1) && + cx_pkt->data.frame + .spatial_layer_encoded[enc_cfg.ss_number_layers - 1] && !(layer_id.temporal_layer_id > 0 && - layer_id.temporal_layer_id == (int)enc_cfg.ts_number_layers - 1 && - cx_pkt->data.frame - .spatial_layer_encoded[enc_cfg.ss_number_layers - 1])) { + layer_id.temporal_layer_id == (int)enc_cfg.ts_number_layers - 1)) { test_decode(&encoder, &decoder, frame_cnt, &mismatch_seen); } #endif diff --git a/media/libvpx/libvpx/examples/vp9cx_set_ref.c b/media/libvpx/libvpx/examples/vp9cx_set_ref.c index 1a0823153b..6e12d668b0 100644 --- a/media/libvpx/libvpx/examples/vp9cx_set_ref.c +++ b/media/libvpx/libvpx/examples/vp9cx_set_ref.c @@ -60,7 +60,7 @@ static const char *exec_name; -void usage_exit() { +void usage_exit(void) { fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile> " "<frame> <limit(optional)>\n", diff --git a/media/libvpx/libvpx/libs.doxy_template b/media/libvpx/libvpx/libs.doxy_template index 1ee442af3e..6d05162d00 100644 --- a/media/libvpx/libvpx/libs.doxy_template +++ b/media/libvpx/libvpx/libs.doxy_template @@ -1223,14 +1223,6 @@ DOT_GRAPH_MAX_NODES = 50 MAX_DOT_GRAPH_DEPTH = 0 -# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent -# background. This is disabled by default, which results in a white background. -# Warning: Depending on the platform used, enabling this option may lead to -# badly anti-aliased labels on the edges of a graph (i.e. they become hard to -# read). - -DOT_TRANSPARENT = YES - # Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). 
This # makes dot run faster, but since only newer versions of dot (>1.8.10) diff --git a/media/libvpx/libvpx/libs.mk b/media/libvpx/libvpx/libs.mk index ff1c569c3b..5964386710 100644 --- a/media/libvpx/libvpx/libs.mk +++ b/media/libvpx/libvpx/libs.mk @@ -313,9 +313,9 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) # To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current # SO_VERSION_* then follow the rules in the link to detemine the new version # (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1 -SO_VERSION_MAJOR := 8 +SO_VERSION_MAJOR := 9 SO_VERSION_MINOR := 0 -SO_VERSION_PATCH := 1 +SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib SHARED_LIB_SUF := .dylib diff --git a/media/libvpx/libvpx/test/android/get_files.py b/media/libvpx/libvpx/test/android/get_files.py index 1c69740d2b..98ce7b1947 100644 --- a/media/libvpx/libvpx/test/android/get_files.py +++ b/media/libvpx/libvpx/test/android/get_files.py @@ -38,7 +38,7 @@ def get_file_sha(filename): buf = file.read(HASH_CHUNK) return sha_hash.hexdigest() except IOError: - print "Error reading " + filename + print("Error reading " + filename) # Downloads a file from a url, and then checks the sha against the passed # in sha @@ -67,7 +67,7 @@ try: getopt.getopt(sys.argv[1:], \ "u:i:o:", ["url=", "input_csv=", "output_dir="]) except: - print 'get_files.py -u <url> -i <input_csv> -o <output_dir>' + print('get_files.py -u <url> -i <input_csv> -o <output_dir>') sys.exit(2) for opt, arg in opts: @@ -79,7 +79,7 @@ for opt, arg in opts: local_resource_path = os.path.join(arg) if len(sys.argv) != 7: - print "Expects two paths and a url!" 
+ print("Expects two paths and a url!") exit(1) if not os.path.isdir(local_resource_path): @@ -89,7 +89,7 @@ file_list_csv = open(file_list_path, "rb") # Our 'csv' file uses multiple spaces as a delimiter, python's # csv class only uses single character delimiters, so we convert them below -file_list_reader = csv.reader((re.sub(' +', ' ', line) \ +file_list_reader = csv.reader((re.sub(' +', ' ', line.decode('utf-8')) \ for line in file_list_csv), delimiter = ' ') file_shas = [] @@ -104,15 +104,16 @@ for row in file_list_reader: file_list_csv.close() # Download files, only if they don't already exist and have correct shas -for filename, sha in itertools.izip(file_names, file_shas): +for filename, sha in zip(file_names, file_shas): + filename = filename.lstrip('*') path = os.path.join(local_resource_path, filename) if os.path.isfile(path) \ and get_file_sha(path) == sha: - print path + ' exists, skipping' + print(path + ' exists, skipping') continue for retry in range(0, ftp_retries): - print "Downloading " + path + print("Downloading " + path) if not download_and_check_sha(url, filename, sha): - print "Sha does not match, retrying..." 
+ print("Sha does not match, retrying...") else: break diff --git a/media/libvpx/libvpx/test/avg_test.cc b/media/libvpx/libvpx/test/avg_test.cc index ede9c0ba8c..7816912ff7 100644 --- a/media/libvpx/libvpx/test/avg_test.cc +++ b/media/libvpx/libvpx/test/avg_test.cc @@ -719,6 +719,15 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(1024, &vp9_block_error_fp_neon))); #endif // HAVE_NEON +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_sve), + make_tuple(64, &vp9_block_error_fp_sve), + make_tuple(256, &vp9_block_error_fp_sve), + make_tuple(1024, &vp9_block_error_fp_sve))); +#endif // HAVE_SVE + #if HAVE_MSA INSTANTIATE_TEST_SUITE_P( MSA, AverageTest, diff --git a/media/libvpx/libvpx/test/codec_factory.h b/media/libvpx/libvpx/test/codec_factory.h index c7e8f54847..179ccdf011 100644 --- a/media/libvpx/libvpx/test/codec_factory.h +++ b/media/libvpx/libvpx/test/codec_factory.h @@ -164,7 +164,9 @@ const libvpx_test::VP8CodecFactory kVP8; &libvpx_test::kVP8)), \ __VA_ARGS__)) #else -#define VP8_INSTANTIATE_TEST_SUITE(test, ...) +// static_assert() is used to avoid warnings about an extra ';' outside of a +// function. +#define VP8_INSTANTIATE_TEST_SUITE(test, ...) static_assert(CONFIG_VP8 == 0, "") #endif // CONFIG_VP8 /* @@ -259,7 +261,9 @@ const libvpx_test::VP9CodecFactory kVP9; &libvpx_test::kVP9)), \ __VA_ARGS__)) #else -#define VP9_INSTANTIATE_TEST_SUITE(test, ...) +// static_assert() is used to avoid warnings about an extra ';' outside of a +// function. +#define VP9_INSTANTIATE_TEST_SUITE(test, ...) 
static_assert(CONFIG_VP9 == 0, "") #endif // CONFIG_VP9 } // namespace libvpx_test diff --git a/media/libvpx/libvpx/test/convolve_test.cc b/media/libvpx/libvpx/test/convolve_test.cc index ffd5c41c63..11f7625137 100644 --- a/media/libvpx/libvpx/test/convolve_test.cc +++ b/media/libvpx/libvpx/test/convolve_test.cc @@ -1218,6 +1218,24 @@ WRAP(convolve8_neon, 12) WRAP(convolve8_avg_neon, 12) #endif // HAVE_NEON +#if HAVE_SVE +WRAP(convolve8_horiz_sve, 8) +WRAP(convolve8_avg_horiz_sve, 8) +WRAP(convolve8_horiz_sve, 10) +WRAP(convolve8_avg_horiz_sve, 10) +WRAP(convolve8_horiz_sve, 12) +WRAP(convolve8_avg_horiz_sve, 12) +#endif // HAVE_SVE + +#if HAVE_SVE2 +WRAP(convolve8_vert_sve2, 8) +WRAP(convolve8_avg_vert_sve2, 8) +WRAP(convolve8_vert_sve2, 10) +WRAP(convolve8_avg_vert_sve2, 10) +WRAP(convolve8_vert_sve2, 12) +WRAP(convolve8_avg_vert_sve2, 12) +#endif // HAVE_SVE2 + WRAP(convolve_copy_c, 8) WRAP(convolve_avg_c, 8) WRAP(convolve8_horiz_c, 8) @@ -1438,6 +1456,74 @@ INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, ConvolveTest, ::testing::ValuesIn(kArrayConvolve_neon_dotprod)); #endif // HAVE_NEON_DOTPROD +#if HAVE_SVE +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_sve( + wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_sve_8, + wrap_convolve8_avg_horiz_sve_8, wrap_convolve8_vert_c_8, + wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, + wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8, + wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, + wrap_convolve8_avg_c_8, 8); +const ConvolveFunctions convolve10_sve( + wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, + wrap_convolve8_horiz_sve_10, wrap_convolve8_avg_horiz_sve_10, + wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, + wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10, + wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10, + wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10, + 10); 
+const ConvolveFunctions convolve12_sve( + wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, + wrap_convolve8_horiz_sve_12, wrap_convolve8_avg_horiz_sve_12, + wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, + wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12, + wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12, + wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12, + 12); + +const ConvolveParam kArrayConvolve_sve[] = { ALL_SIZES(convolve8_sve), + ALL_SIZES(convolve10_sve), + ALL_SIZES(convolve12_sve) }; +INSTANTIATE_TEST_SUITE_P(SVE, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_sve)); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SVE + +#if HAVE_SVE2 +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_sve2( + wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_c_8, + wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_sve2_8, + wrap_convolve8_avg_vert_sve2_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, + wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8, + wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, + wrap_convolve8_avg_c_8, 8); +const ConvolveFunctions convolve10_sve2( + wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, wrap_convolve8_horiz_c_10, + wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_sve2_10, + wrap_convolve8_avg_vert_sve2_10, wrap_convolve8_c_10, + wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10, + wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10, + wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10, + 10); +const ConvolveFunctions convolve12_sve2( + wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, wrap_convolve8_horiz_c_12, + wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_sve2_12, + wrap_convolve8_avg_vert_sve2_12, wrap_convolve8_c_12, + wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12, + wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12, + wrap_convolve8_avg_vert_c_12, 
wrap_convolve8_c_12, wrap_convolve8_avg_c_12, + 12); + +const ConvolveParam kArrayConvolve_sve2[] = { ALL_SIZES(convolve8_sve2), + ALL_SIZES(convolve10_sve2), + ALL_SIZES(convolve12_sve2) }; +INSTANTIATE_TEST_SUITE_P(SVE2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_sve2)); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SVE2 + #if HAVE_NEON_I8MM const ConvolveFunctions convolve8_neon_i8mm( vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_i8mm, diff --git a/media/libvpx/libvpx/test/encode_api_test.cc b/media/libvpx/libvpx/test/encode_api_test.cc index 508083673a..ca3b17a5d5 100644 --- a/media/libvpx/libvpx/test/encode_api_test.cc +++ b/media/libvpx/libvpx/test/encode_api_test.cc @@ -8,7 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <cassert> #include <climits> +#include <cstdint> #include <cstring> #include <initializer_list> #include <new> @@ -44,6 +46,49 @@ bool IsVP9(vpx_codec_iface_t *iface) { 0; } +void *Memset16(void *dest, int val, size_t length) { + uint16_t *dest16 = reinterpret_cast<uint16_t *>(dest); + for (size_t i = 0; i < length; i++) { + *dest16++ = val; + } + return dest; +} + +vpx_image_t *CreateImage(vpx_bit_depth_t bit_depth, vpx_img_fmt_t fmt, + unsigned int width, unsigned int height) { + assert(fmt != VPX_IMG_FMT_NV12); + if (bit_depth > VPX_BITS_8) { + fmt = static_cast<vpx_img_fmt_t>(fmt | VPX_IMG_FMT_HIGHBITDEPTH); + } + vpx_image_t *image = vpx_img_alloc(nullptr, fmt, width, height, 1); + if (!image) return image; + + const int val = 1 << (bit_depth - 1); + const unsigned int uv_h = + (image->d_h + image->y_chroma_shift) >> image->y_chroma_shift; + const unsigned int uv_w = + (image->d_w + image->x_chroma_shift) >> image->x_chroma_shift; + if (bit_depth > VPX_BITS_8) { + for (unsigned int i = 0; i < image->d_h; ++i) { + Memset16(image->planes[0] + i * image->stride[0], val, image->d_w); + } + for (unsigned int i = 0; i < uv_h; ++i) { + Memset16(image->planes[1] + i * 
image->stride[1], val, uv_w); + Memset16(image->planes[2] + i * image->stride[2], val, uv_w); + } + } else { + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], val, image->d_w); + } + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], val, uv_w); + memset(image->planes[2] + i * image->stride[2], val, uv_w); + } + } + + return image; +} + TEST(EncodeAPI, InvalidParams) { uint8_t buf[1] = { 0 }; vpx_image_t img; @@ -198,7 +243,51 @@ TEST(EncodeAPI, RandomPixelsVp8) { ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); // Generate random frame data and encode - uint8_t img[1280 * 720 * 3 / 2]; + libvpx_test::RandomVideoSource video; + video.SetSize(cfg.g_w, cfg.g_h); + video.SetImageFormat(VPX_IMG_FMT_I420); + video.Begin(); + ASSERT_EQ(vpx_codec_encode(&enc, video.img(), video.pts(), video.duration(), + /*flags=*/0, VPX_DL_BEST_QUALITY), + VPX_CODEC_OK); + + // Destroy libvpx encoder + vpx_codec_destroy(&enc); +} + +TEST(EncodeAPI, ChangeToL1T3AndSetBitrateVp8) { + // Initialize libvpx encoder + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + cfg.g_threads = 1; + cfg.g_profile = 0; + cfg.g_w = 1; + cfg.g_h = 64; + cfg.g_bit_depth = VPX_BITS_8; + cfg.g_input_bit_depth = 8; + cfg.g_timebase.num = 1; + cfg.g_timebase.den = 1000000; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_dropframe_thresh = 0; // Don't drop frames + cfg.rc_resize_allowed = 0; + cfg.rc_end_usage = VPX_VBR; + cfg.rc_target_bitrate = 10; + cfg.rc_min_quantizer = 2; + cfg.rc_max_quantizer = 58; + cfg.kf_mode = VPX_KF_AUTO; + cfg.kf_min_dist = 0; + cfg.kf_max_dist = 10000; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, -6), VPX_CODEC_OK); + + // Generate random frame 
data and encode + uint8_t img[1 * 64 * 3 / 2]; libvpx_test::ACMRandom rng; for (size_t i = 0; i < sizeof(img); ++i) { img[i] = rng.Rand8(); @@ -207,13 +296,142 @@ TEST(EncodeAPI, RandomPixelsVp8) { ASSERT_EQ( vpx_img_wrap(&img_wrapper, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1, img), &img_wrapper); - ASSERT_EQ(vpx_codec_encode(&enc, &img_wrapper, 0, 1, 0, VPX_DL_BEST_QUALITY), + vpx_enc_frame_flags_t flags = VPX_EFLAG_FORCE_KF; + ASSERT_EQ( + vpx_codec_encode(&enc, &img_wrapper, 0, 500000, flags, VPX_DL_REALTIME), + VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_encode(&enc, nullptr, -1, 0, 0, 0), VPX_CODEC_OK); + + cfg.rc_target_bitrate = 4294967; + // Set the scalability mode to L1T3. + cfg.ts_number_layers = 3; + cfg.ts_periodicity = 4; + cfg.ts_layer_id[0] = 0; + cfg.ts_layer_id[1] = 2; + cfg.ts_layer_id[2] = 1; + cfg.ts_layer_id[3] = 2; + cfg.ts_rate_decimator[0] = 4; + cfg.ts_rate_decimator[1] = 2; + cfg.ts_rate_decimator[2] = 1; + // Bitrate allocation L0: 50% L1: 20% L2: 30% + cfg.layer_target_bitrate[0] = cfg.ts_target_bitrate[0] = + 50 * cfg.rc_target_bitrate / 100; + cfg.layer_target_bitrate[1] = cfg.ts_target_bitrate[1] = + 70 * cfg.rc_target_bitrate / 100; + cfg.layer_target_bitrate[2] = cfg.ts_target_bitrate[2] = + cfg.rc_target_bitrate; + cfg.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212; + cfg.g_error_resilient = VPX_ERROR_RESILIENT_DEFAULT; + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_TEMPORAL_LAYER_ID, 2), VPX_CODEC_OK); + constexpr vpx_enc_frame_flags_t VP8_UPDATE_NOTHING = + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST; + // Layer 2: only reference last frame, no updates + // It only depends on layer 0 + flags = VP8_UPDATE_NOTHING | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_REF_GF; + ASSERT_EQ( + vpx_codec_encode(&enc, &img_wrapper, 0, 500000, flags, VPX_DL_REALTIME), + VPX_CODEC_OK); + // Destroy libvpx encoder vpx_codec_destroy(&enc); } -#endif + +// Emulates the 
WebCodecs VideoEncoder interface. +class VP8Encoder { + public: + explicit VP8Encoder(int speed) : speed_(speed) {} + ~VP8Encoder(); + + void Configure(unsigned int threads, unsigned int width, unsigned int height, + vpx_rc_mode end_usage, vpx_enc_deadline_t deadline); + void Encode(bool key_frame); + + private: + const int speed_; + bool initialized_ = false; + vpx_codec_enc_cfg_t cfg_; + vpx_codec_ctx_t enc_; + int frame_index_ = 0; + vpx_enc_deadline_t deadline_ = 0; +}; + +VP8Encoder::~VP8Encoder() { + if (initialized_) { + EXPECT_EQ(vpx_codec_destroy(&enc_), VPX_CODEC_OK); + } +} + +void VP8Encoder::Configure(unsigned int threads, unsigned int width, + unsigned int height, vpx_rc_mode end_usage, + vpx_enc_deadline_t deadline) { + deadline_ = deadline; + + if (!initialized_) { + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0), + VPX_CODEC_OK); + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.g_timebase.num = 1; + cfg_.g_timebase.den = 1000 * 1000; // microseconds + cfg_.g_pass = VPX_RC_ONE_PASS; + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = end_usage; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 58; + ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK); + initialized_ = true; + return; + } + + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.rc_end_usage = end_usage; + ASSERT_EQ(vpx_codec_enc_config_set(&enc_, &cfg_), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc_); +} + +void VP8Encoder::Encode(bool key_frame) { + const vpx_codec_cx_pkt_t *pkt; + vpx_image_t *image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg_.g_w, cfg_.g_h); + ASSERT_NE(image, nullptr); + const vpx_enc_frame_flags_t flags = key_frame ? 
VPX_EFLAG_FORCE_KF : 0; + ASSERT_EQ(vpx_codec_encode(&enc_, image, frame_index_, 1, flags, deadline_), + VPX_CODEC_OK); + ++frame_index_; + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + if (key_frame) { + ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, VPX_FRAME_IS_KEY); + } + } + vpx_img_free(image); +} + +// This is the reproducer testcase for crbug.com/324459561. However, +// just running this test is not enough to reproduce the bug. We also +// need to send signals to the test. +TEST(EncodeAPI, Chromium324459561) { + VP8Encoder encoder(-12); + + encoder.Configure(11, 1685, 652, VPX_CBR, VPX_DL_REALTIME); + + encoder.Encode(true); + encoder.Encode(true); + encoder.Encode(true); + + encoder.Configure(0, 1685, 1, VPX_VBR, VPX_DL_REALTIME); +} +#endif // CONFIG_VP8_ENCODER // Set up 2 spatial streams with 2 temporal layers per stream, and generate // invalid configuration by setting the temporal layer rate allocation @@ -499,6 +717,131 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { } } +TEST(EncodeAPI, ConfigResizeBiggerAfterInit) { + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + vpx_codec_enc_cfg_t cfg; + vpx_codec_ctx_t enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, 1, 1, &enc, &cfg)); + + cfg.g_w = 1920; + cfg.g_h = 1; + EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg), + IsVP9(iface) ? 
VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM); + + EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + +TEST(EncodeAPI, ConfigResizeBiggerAfterEncode) { + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + vpx_codec_enc_cfg_t cfg; + vpx_codec_ctx_t enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, 1, 1, &enc, &cfg)); + EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc)); + + cfg.g_w = 1920; + cfg.g_h = 1; + EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg), + IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM); + + cfg.g_w = 1920; + cfg.g_h = 1080; + EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg), + IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM); + + EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + +TEST(EncodeAPI, PtsSmallerThanInitialPts) { + for (const auto *iface : kCodecIfaces) { + // Initialize libvpx encoder. + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + // Create input image. + vpx_image_t *const image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + ASSERT_EQ(vpx_codec_encode(&enc, image, 12, 1, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_encode(&enc, image, 13, 1, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_OK); + // pts (10) is smaller than the initial pts (12). + ASSERT_EQ(vpx_codec_encode(&enc, image, 10, 1, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); + + // Free resources. + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + +TEST(EncodeAPI, PtsOrDurationTooBig) { + for (const auto *iface : kCodecIfaces) { + // Initialize libvpx encoder. 
+ vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + // Create input image. + vpx_image_t *const image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_OK); +#if ULONG_MAX > INT64_MAX + // duration is too big. + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, (1ul << 63), 0, 2), + VPX_CODEC_INVALID_PARAM); +#endif + // pts, when converted to ticks, is too big. + ASSERT_EQ(vpx_codec_encode(&enc, image, INT64_MAX / 1000000 + 1, 1, 0, + VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); +#if ULONG_MAX > INT64_MAX + // duration is too big. + ASSERT_EQ( + vpx_codec_encode(&enc, image, 0, (1ul << 63), 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); + // pts + duration is too big. + ASSERT_EQ( + vpx_codec_encode(&enc, image, 1, INT64_MAX, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); +#endif + // pts + duration, when converted to ticks, is too big. +#if ULONG_MAX > INT64_MAX + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 0xbd6b566b15c7, 0, + VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); +#endif + ASSERT_EQ(vpx_codec_encode(&enc, image, INT64_MAX / 1000000, 1, 0, + VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); + + // Free resources. 
+ vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + #if CONFIG_VP9_ENCODER // Frame size needed to trigger the overflow exceeds the max buffer allowed on // 32-bit systems defined by VPX_MAX_ALLOCABLE_MEMORY @@ -528,28 +871,16 @@ TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { } #endif // VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 -vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) { - vpx_image_t *image = - vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, width, height, 1); - if (!image) return image; - - for (unsigned int i = 0; i < image->d_h; ++i) { - memset(image->planes[0] + i * image->stride[0], 128, image->d_w); - } - const unsigned int uv_h = (image->d_h + 1) / 2; - const unsigned int uv_w = (image->d_w + 1) / 2; - for (unsigned int i = 0; i < uv_h; ++i) { - memset(image->planes[1] + i * image->stride[1], 128, uv_w); - memset(image->planes[2] + i * image->stride[2], 128, uv_w); - } - - return image; -} - // Emulates the WebCodecs VideoEncoder interface. class VP9Encoder { public: - explicit VP9Encoder(int speed) : speed_(speed) {} + explicit VP9Encoder(int speed) + : speed_(speed), bit_depth_(VPX_BITS_8), fmt_(VPX_IMG_FMT_I420) {} + // The image format `fmt` must not have the VPX_IMG_FMT_HIGHBITDEPTH bit set. + // If bit_depth > 8, we will set the VPX_IMG_FMT_HIGHBITDEPTH bit before + // passing the image format to vpx_img_alloc(). 
+ VP9Encoder(int speed, vpx_bit_depth_t bit_depth, vpx_img_fmt_t fmt) + : speed_(speed), bit_depth_(bit_depth), fmt_(fmt) {} ~VP9Encoder(); void Configure(unsigned int threads, unsigned int width, unsigned int height, @@ -558,6 +889,8 @@ class VP9Encoder { private: const int speed_; + const vpx_bit_depth_t bit_depth_; + const vpx_img_fmt_t fmt_; bool initialized_ = false; vpx_codec_enc_cfg_t cfg_; vpx_codec_ctx_t enc_; @@ -577,12 +910,22 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width, deadline_ = deadline; if (!initialized_) { + ASSERT_EQ(fmt_ & VPX_IMG_FMT_HIGHBITDEPTH, 0); + const bool high_bit_depth = bit_depth_ > VPX_BITS_8; + const bool is_420 = fmt_ == VPX_IMG_FMT_I420; vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0), VPX_CODEC_OK); cfg_.g_threads = threads; + // In profiles 0 and 2, only 4:2:0 format is allowed. In profiles 1 and 3, + // all other subsampling formats are allowed. In profiles 0 and 1, only bit + // depth 8 is allowed. In profiles 2 and 3, only bit depths 10 and 12 are + // allowed. + cfg_.g_profile = 2 * high_bit_depth + !is_420; cfg_.g_w = width; cfg_.g_h = height; + cfg_.g_bit_depth = bit_depth_; + cfg_.g_input_bit_depth = bit_depth_; cfg_.g_timebase.num = 1; cfg_.g_timebase.den = 1000 * 1000; // microseconds cfg_.g_pass = VPX_RC_ONE_PASS; @@ -590,7 +933,10 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width, cfg_.rc_end_usage = end_usage; cfg_.rc_min_quantizer = 2; cfg_.rc_max_quantizer = 58; - ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK); + ASSERT_EQ( + vpx_codec_enc_init(&enc_, iface, &cfg_, + high_bit_depth ? 
VPX_CODEC_USE_HIGHBITDEPTH : 0), + VPX_CODEC_OK); ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK); initialized_ = true; return; @@ -606,13 +952,13 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width, void VP9Encoder::Encode(bool key_frame) { const vpx_codec_cx_pkt_t *pkt; - vpx_image_t *image = CreateImage(cfg_.g_w, cfg_.g_h); + vpx_image_t *image = CreateImage(bit_depth_, fmt_, cfg_.g_w, cfg_.g_h); ASSERT_NE(image, nullptr); const vpx_enc_frame_flags_t frame_flags = key_frame ? VPX_EFLAG_FORCE_KF : 0; ASSERT_EQ( vpx_codec_encode(&enc_, image, frame_index_, 1, frame_flags, deadline_), VPX_CODEC_OK); - frame_index_++; + ++frame_index_; vpx_codec_iter_t iter = nullptr; while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) { ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); @@ -944,6 +1290,28 @@ TEST(EncodeAPI, Buganizer311294795) { encoder.Encode(false); encoder.Encode(false); } + +TEST(EncodeAPI, Buganizer317105128) { + VP9Encoder encoder(-9); + encoder.Configure(0, 1, 1, VPX_CBR, VPX_DL_GOOD_QUALITY); + encoder.Configure(16, 1920, 1, VPX_CBR, VPX_DL_REALTIME); +} + +TEST(EncodeAPI, Buganizer319964497) { + VP9Encoder encoder(7); + encoder.Configure(/*threads=*/1, /*width=*/320, /*height=*/240, VPX_VBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/true); + encoder.Encode(/*key_frame=*/true); + encoder.Encode(/*key_frame=*/false); + encoder.Configure(/*threads=*/1, /*width=*/1, /*height=*/1, VPX_VBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); + encoder.Configure(/*threads=*/1, /*width=*/2, /*height=*/2, VPX_CBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); +} + #endif // CONFIG_VP9_ENCODER } // namespace diff --git a/media/libvpx/libvpx/test/frame_size_tests.cc b/media/libvpx/libvpx/test/frame_size_tests.cc index eea5647a78..6306e4f2ca 100644 --- a/media/libvpx/libvpx/test/frame_size_tests.cc +++ b/media/libvpx/libvpx/test/frame_size_tests.cc @@ -193,7 +193,7 @@ 
TEST_F(VP9FrameSizeTestsLarge, ValidSizes) { // size or almost 1 gig of memory. // In total the allocations will exceed 2GiB which may cause a failure with // mingw + wine, use a smaller size in that case. -#if defined(_WIN32) && !defined(_WIN64) || defined(__OS2__) +#if defined(_WIN32) && !defined(_WIN64) video.SetSize(4096, 3072); #else video.SetSize(4096, 4096); diff --git a/media/libvpx/libvpx/test/init_vpx_test.cc b/media/libvpx/libvpx/test/init_vpx_test.cc index f66f00b5c1..353c5043eb 100644 --- a/media/libvpx/libvpx/test/init_vpx_test.cc +++ b/media/libvpx/libvpx/test/init_vpx_test.cc @@ -57,6 +57,9 @@ void init_vpx_test() { if (!(caps & HAS_SVE)) { append_negative_gtest_filter(":SVE.*:SVE/*"); } + if (!(caps & HAS_SVE2)) { + append_negative_gtest_filter(":SVE2.*:SVE2/*"); + } #elif VPX_ARCH_ARM const int caps = arm_cpu_caps(); if (!(caps & HAS_NEON)) append_negative_gtest_filter(":NEON.*:NEON/*"); diff --git a/media/libvpx/libvpx/test/resize_test.cc b/media/libvpx/libvpx/test/resize_test.cc index 20ad2229b4..f27bd7ebbc 100644 --- a/media/libvpx/libvpx/test/resize_test.cc +++ b/media/libvpx/libvpx/test/resize_test.cc @@ -7,8 +7,6 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include <stdio.h> - #include <climits> #include <vector> #include "third_party/googletest/src/include/gtest/gtest.h" @@ -598,6 +596,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) { mismatch_nframes_ = 0; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER unsigned int last_w = cfg_.g_w; unsigned int last_h = cfg_.g_h; int resize_count = 0; @@ -613,12 +612,12 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) { } } -#if CONFIG_VP9_DECODER // Verify that we get 1 resize down event in this test. 
ASSERT_EQ(1, resize_count) << "Resizing should occur."; EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames()); #else - printf("Warning: VP9 decoder unavailable, unable to check resize count!\n"); + GTEST_SKIP() + << "Warning: VP9 decoder unavailable, unable to check resize count!\n"; #endif } @@ -669,7 +668,8 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) { ASSERT_EQ(resize_count, 4) << "Resizing should occur twice."; EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames()); #else - printf("Warning: VP9 decoder unavailable, unable to check resize count!\n"); + GTEST_SKIP() + << "Warning: VP9 decoder unavailable, unable to check resize count!\n"; #endif } diff --git a/media/libvpx/libvpx/test/sum_squares_test.cc b/media/libvpx/libvpx/test/sum_squares_test.cc index d3c76a34d2..57037f1e30 100644 --- a/media/libvpx/libvpx/test/sum_squares_test.cc +++ b/media/libvpx/libvpx/test/sum_squares_test.cc @@ -119,6 +119,13 @@ INSTANTIATE_TEST_SUITE_P( &vpx_sum_squares_2d_i16_neon))); #endif // HAVE_NEON +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_sve))); +#endif // HAVE_SVE + #if HAVE_SSE2 INSTANTIATE_TEST_SUITE_P( SSE2, SumSquaresTest, diff --git a/media/libvpx/libvpx/test/variance_test.cc b/media/libvpx/libvpx/test/variance_test.cc index b8320e9ceb..5cf6a5fb8e 100644 --- a/media/libvpx/libvpx/test/variance_test.cc +++ b/media/libvpx/libvpx/test/variance_test.cc @@ -29,6 +29,9 @@ namespace { typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride); +typedef void (*GetVarianceFunc)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, int *sum); typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src); using libvpx_test::ACMRandom; @@ -63,35 +66,65 @@ static unsigned int mb_ss_ref(const int16_t *src) { * Our codebase calculates the "diff" value in 
the variance algorithm by * (src - ref). */ -static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w, - int l2h, int src_stride, int ref_stride, - uint32_t *sse_ptr, bool use_high_bit_depth_, - vpx_bit_depth_t bit_depth) { - int64_t se = 0; - uint64_t sse = 0; - const int w = 1 << l2w; - const int h = 1 << l2h; +static void variance(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride, int w, int h, bool use_high_bit_depth_, + uint64_t *sse, int64_t *se, vpx_bit_depth_t bit_depth) { + int64_t se_long = 0; + uint64_t sse_long = 0; + for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { - int diff; + int diff = 0; if (!use_high_bit_depth_) { diff = src[y * src_stride + x] - ref[y * ref_stride + x]; - se += diff; - sse += diff * diff; #if CONFIG_VP9_HIGHBITDEPTH } else { diff = CONVERT_TO_SHORTPTR(src)[y * src_stride + x] - CONVERT_TO_SHORTPTR(ref)[y * ref_stride + x]; - se += diff; - sse += diff * diff; #endif // CONFIG_VP9_HIGHBITDEPTH } + se_long += diff; + sse_long += diff * diff; } } - RoundHighBitDepth(bit_depth, &se, &sse); - *sse_ptr = static_cast<uint32_t>(sse); + + RoundHighBitDepth(bit_depth, &se_long, &sse_long); + + *sse = sse_long; + *se = se_long; +} + +static void get_variance_ref(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int l2w, + int l2h, bool use_high_bit_depth_, uint32_t *sse, + int *se, vpx_bit_depth_t bit_depth) { + const int w = 1 << l2w; + const int h = 1 << l2h; + int64_t se_long = 0; + uint64_t sse_long = 0; + + variance(src, src_stride, ref, ref_stride, w, h, use_high_bit_depth_, + &sse_long, &se_long, bit_depth); + + *sse = static_cast<uint32_t>(sse_long); + *se = static_cast<int>(se_long); +} + +static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w, + int l2h, int src_stride, int ref_stride, + uint32_t *sse_ptr, bool use_high_bit_depth_, + vpx_bit_depth_t bit_depth) { + const int w = 1 << l2w; + const int h = 1 << l2h; + int64_t se_long = 
0; + uint64_t sse_long = 0; + + variance(src, src_stride, ref, ref_stride, w, h, use_high_bit_depth_, + &sse_long, &se_long, bit_depth); + + *sse_ptr = static_cast<uint32_t>(sse_long); return static_cast<uint32_t>( - sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h))); + sse_long - ((static_cast<int64_t>(se_long) * se_long) >> (l2w + l2h))); } /* The subpel reference functions differ from the codec version in one aspect: @@ -337,6 +370,9 @@ class MainTestClass void OneQuarterTest(); void SpeedTest(); + // GetVariance tests + void RefTestGetVar(); + // MSE/SSE tests void RefTestMse(); void RefTestSse(); @@ -493,6 +529,35 @@ void MainTestClass<VarianceFunctionType>::SpeedTest() { } //////////////////////////////////////////////////////////////////////////////// +// Tests related to GetVariance. +template <typename GetVarianceFunctionType> +void MainTestClass<GetVarianceFunctionType>::RefTestGetVar() { + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < block_size(); j++) { + if (!use_high_bit_depth()) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + unsigned int sse1, sse2; + int sum1, sum2; + const int stride = width(); + ASM_REGISTER_STATE_CHECK( + params_.func(src_, stride, ref_, stride, &sse1, &sum1)); + get_variance_ref(src_, stride, ref_, stride, params_.log2width, + params_.log2height, use_high_bit_depth(), &sse2, &sum2, + params_.bit_depth); + EXPECT_EQ(sse1, sse2) << "Error at test index: " << i; + EXPECT_EQ(sum1, sum2) << "Error at test index: " << i; + } +} + +//////////////////////////////////////////////////////////////////////////////// // Tests related to MSE / SSE. 
template <typename FunctionType> @@ -766,6 +831,7 @@ void SubpelVarianceTest<vpx_subp_avg_variance_fn_t>::RefTest() { typedef MainTestClass<Get4x4SseFunc> VpxSseTest; typedef MainTestClass<vpx_variance_fn_t> VpxMseTest; typedef MainTestClass<vpx_variance_fn_t> VpxVarianceTest; +typedef MainTestClass<GetVarianceFunc> VpxGetVarianceTest; typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxSubpelVarianceTest; typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t> VpxSubpelAvgVarianceTest; @@ -779,6 +845,7 @@ TEST_P(VpxVarianceTest, Ref) { RefTest(); } TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); } TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); } TEST_P(VpxVarianceTest, DISABLED_Speed) { SpeedTest(); } +TEST_P(VpxGetVarianceTest, RefGetVar) { RefTestGetVar(); } TEST_P(SumOfSquaresTest, Const) { ConstTest(); } TEST_P(SumOfSquaresTest, Ref) { RefTest(); } TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); } @@ -818,6 +885,16 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 3, &vpx_variance4x8_c), VarianceParams(2, 2, &vpx_variance4x4_c))); +typedef TestParams<GetVarianceFunc> GetVarianceParams; +INSTANTIATE_TEST_SUITE_P( + C, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_c), + GetVarianceParams(3, 3, &vpx_get8x8var_c), + GetVarianceParams(4, 4, &vpx_get16x16var_c), + GetVarianceParams(3, 3, &vpx_get8x8var_c), + GetVarianceParams(4, 4, &vpx_get16x16var_c), + GetVarianceParams(3, 3, &vpx_get8x8var_c))); + typedef TestParams<vpx_subpixvariance_fn_t> SubpelVarianceParams; INSTANTIATE_TEST_SUITE_P( C, VpxSubpelVarianceTest, @@ -856,6 +933,7 @@ INSTANTIATE_TEST_SUITE_P( #if CONFIG_VP9_HIGHBITDEPTH typedef MainTestClass<vpx_variance_fn_t> VpxHBDVarianceTest; +typedef MainTestClass<GetVarianceFunc> VpxHBDGetVarianceTest; typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxHBDSubpelVarianceTest; typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t> VpxHBDSubpelAvgVarianceTest; @@ -865,6 +943,7 @@ TEST_P(VpxHBDVarianceTest, 
Ref) { RefTest(); } TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); } TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); } TEST_P(VpxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); } +TEST_P(VpxHBDGetVarianceTest, RefGetVar) { RefTestGetVar(); } TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); } TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } @@ -933,6 +1012,15 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 2, &vpx_highbd_8_variance4x4_c, 8))); INSTANTIATE_TEST_SUITE_P( + C, VpxHBDGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_c, 12), + GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_c, 12), + GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_c, 10), + GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_c, 10), + GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_c, 8), + GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_c, 8))); + +INSTANTIATE_TEST_SUITE_P( C, VpxHBDSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8), @@ -1119,6 +1207,15 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 2, &vpx_variance4x4_sse2))); INSTANTIATE_TEST_SUITE_P( + SSE2, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_sse2), + GetVarianceParams(3, 3, &vpx_get8x8var_sse2), + GetVarianceParams(4, 4, &vpx_get16x16var_sse2), + GetVarianceParams(3, 3, &vpx_get8x8var_sse2), + GetVarianceParams(4, 4, &vpx_get16x16var_sse2), + GetVarianceParams(3, 3, &vpx_get8x8var_sse2))); + +INSTANTIATE_TEST_SUITE_P( SSE2, VpxSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0), @@ -1198,6 +1295,16 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sse2, 8))); INSTANTIATE_TEST_SUITE_P( + SSE2, VpxHBDGetVarianceTest, + ::testing::Values( + GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_sse2, 12), + 
GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_sse2, 12), + GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_sse2, 10), + GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_sse2, 10), + GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_sse2, 8), + GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_sse2, 8))); + +INSTANTIATE_TEST_SUITE_P( SSE2, VpxHBDSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, @@ -1475,6 +1582,15 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 3, &vpx_variance4x8_neon), VarianceParams(2, 2, &vpx_variance4x4_neon))); +INSTANTIATE_TEST_SUITE_P( + NEON, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_neon), + GetVarianceParams(3, 3, &vpx_get8x8var_neon), + GetVarianceParams(4, 4, &vpx_get16x16var_neon), + GetVarianceParams(3, 3, &vpx_get8x8var_neon), + GetVarianceParams(4, 4, &vpx_get16x16var_neon), + GetVarianceParams(3, 3, &vpx_get8x8var_neon))); + #if HAVE_NEON_DOTPROD INSTANTIATE_TEST_SUITE_P( NEON_DOTPROD, VpxSseTest, @@ -1502,6 +1618,15 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(3, 2, &vpx_variance8x4_neon_dotprod), VarianceParams(2, 3, &vpx_variance4x8_neon_dotprod), VarianceParams(2, 2, &vpx_variance4x4_neon_dotprod))); + +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod), + GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod), + GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod), + GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod), + GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod), + GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod))); #endif // HAVE_NEON_DOTPROD INSTANTIATE_TEST_SUITE_P( @@ -1555,9 +1680,6 @@ INSTANTIATE_TEST_SUITE_P( MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8), MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8))); -// TODO(webm:1819): Re-enable when vpx_highbd_8_mse16x16_neon_dotprod, etc. 
can -// be used again. -#if 0 #if HAVE_NEON_DOTPROD INSTANTIATE_TEST_SUITE_P( NEON_DOTPROD, VpxHBDMseTest, @@ -1567,7 +1689,19 @@ INSTANTIATE_TEST_SUITE_P( MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8), MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8))); #endif // HAVE_NEON_DOTPROD -#endif // 0 + +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, VpxHBDMseTest, + ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sve, VPX_BITS_12), + MseParams(4, 3, &vpx_highbd_12_mse16x8_sve, VPX_BITS_12), + MseParams(3, 4, &vpx_highbd_12_mse8x16_sve, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_sve, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_sve, VPX_BITS_10), + MseParams(4, 3, &vpx_highbd_10_mse16x8_sve, VPX_BITS_10), + MseParams(3, 4, &vpx_highbd_10_mse8x16_sve, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_sve, VPX_BITS_10))); +#endif // HAVE_SVE INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDVarianceTest, @@ -1613,6 +1747,28 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 2, &vpx_highbd_8_variance4x4_neon, 8))); INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDGetVarianceTest, + ::testing::Values( + GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_neon, 12), + GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_neon, 12), + GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_neon, 10), + GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_neon, 10), + GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_neon, 8), + GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_neon, 8))); + +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, VpxHBDGetVarianceTest, + ::testing::Values( + GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_sve, 12), + GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_sve, 12), + GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_sve, 10), + GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_sve, 10), + GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_sve, 8), + GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_sve, 8))); +#endif 
// HAVE_SVE + +INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_neon, @@ -1815,6 +1971,53 @@ INSTANTIATE_TEST_SUITE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON +#if HAVE_SVE +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + SVE, VpxHBDVarianceTest, + ::testing::Values( + VarianceParams(6, 6, &vpx_highbd_12_variance64x64_sve, 12), + VarianceParams(6, 5, &vpx_highbd_12_variance64x32_sve, 12), + VarianceParams(5, 6, &vpx_highbd_12_variance32x64_sve, 12), + VarianceParams(5, 5, &vpx_highbd_12_variance32x32_sve, 12), + VarianceParams(5, 4, &vpx_highbd_12_variance32x16_sve, 12), + VarianceParams(4, 5, &vpx_highbd_12_variance16x32_sve, 12), + VarianceParams(4, 4, &vpx_highbd_12_variance16x16_sve, 12), + VarianceParams(4, 3, &vpx_highbd_12_variance16x8_sve, 12), + VarianceParams(3, 4, &vpx_highbd_12_variance8x16_sve, 12), + VarianceParams(3, 3, &vpx_highbd_12_variance8x8_sve, 12), + VarianceParams(3, 2, &vpx_highbd_12_variance8x4_sve, 12), + VarianceParams(2, 3, &vpx_highbd_12_variance4x8_sve, 12), + VarianceParams(2, 2, &vpx_highbd_12_variance4x4_sve, 12), + VarianceParams(6, 6, &vpx_highbd_10_variance64x64_sve, 10), + VarianceParams(6, 5, &vpx_highbd_10_variance64x32_sve, 10), + VarianceParams(5, 6, &vpx_highbd_10_variance32x64_sve, 10), + VarianceParams(5, 5, &vpx_highbd_10_variance32x32_sve, 10), + VarianceParams(5, 4, &vpx_highbd_10_variance32x16_sve, 10), + VarianceParams(4, 5, &vpx_highbd_10_variance16x32_sve, 10), + VarianceParams(4, 4, &vpx_highbd_10_variance16x16_sve, 10), + VarianceParams(4, 3, &vpx_highbd_10_variance16x8_sve, 10), + VarianceParams(3, 4, &vpx_highbd_10_variance8x16_sve, 10), + VarianceParams(3, 3, &vpx_highbd_10_variance8x8_sve, 10), + VarianceParams(3, 2, &vpx_highbd_10_variance8x4_sve, 10), + VarianceParams(2, 3, &vpx_highbd_10_variance4x8_sve, 10), + VarianceParams(2, 2, &vpx_highbd_10_variance4x4_sve, 10), + VarianceParams(6, 6, 
&vpx_highbd_8_variance64x64_sve, 8), + VarianceParams(6, 5, &vpx_highbd_8_variance64x32_sve, 8), + VarianceParams(5, 6, &vpx_highbd_8_variance32x64_sve, 8), + VarianceParams(5, 5, &vpx_highbd_8_variance32x32_sve, 8), + VarianceParams(5, 4, &vpx_highbd_8_variance32x16_sve, 8), + VarianceParams(4, 5, &vpx_highbd_8_variance16x32_sve, 8), + VarianceParams(4, 4, &vpx_highbd_8_variance16x16_sve, 8), + VarianceParams(4, 3, &vpx_highbd_8_variance16x8_sve, 8), + VarianceParams(3, 4, &vpx_highbd_8_variance8x16_sve, 8), + VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sve, 8), + VarianceParams(3, 2, &vpx_highbd_8_variance8x4_sve, 8), + VarianceParams(2, 3, &vpx_highbd_8_variance4x8_sve, 8), + VarianceParams(2, 2, &vpx_highbd_8_variance4x4_sve, 8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SVE + #if HAVE_MSA INSTANTIATE_TEST_SUITE_P(MSA, SumOfSquaresTest, ::testing::Values(vpx_get_mb_ss_msa)); @@ -1846,6 +2049,15 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 2, &vpx_variance4x4_msa))); INSTANTIATE_TEST_SUITE_P( + MSA, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_msa), + GetVarianceParams(3, 3, &vpx_get8x8var_msa), + GetVarianceParams(4, 4, &vpx_get16x16var_msa), + GetVarianceParams(3, 3, &vpx_get8x8var_msa), + GetVarianceParams(4, 4, &vpx_get16x16var_msa), + GetVarianceParams(3, 3, &vpx_get8x8var_msa))); + +INSTANTIATE_TEST_SUITE_P( MSA, VpxSubpelVarianceTest, ::testing::Values( SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0), @@ -1908,6 +2120,15 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(3, 2, &vpx_variance8x4_vsx), VarianceParams(2, 3, &vpx_variance4x8_vsx), VarianceParams(2, 2, &vpx_variance4x4_vsx))); + +INSTANTIATE_TEST_SUITE_P( + VSX, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_vsx), + GetVarianceParams(3, 3, &vpx_get8x8var_vsx), + GetVarianceParams(4, 4, &vpx_get16x16var_vsx), + GetVarianceParams(3, 3, &vpx_get8x8var_vsx), + GetVarianceParams(4, 4, &vpx_get16x16var_vsx), 
+ GetVarianceParams(3, 3, &vpx_get8x8var_vsx))); #endif // HAVE_VSX #if HAVE_MMI diff --git a/media/libvpx/libvpx/test/video_source.h b/media/libvpx/libvpx/test/video_source.h index 2194126f1f..2c035910db 100644 --- a/media/libvpx/libvpx/test/video_source.h +++ b/media/libvpx/libvpx/test/video_source.h @@ -236,7 +236,6 @@ class RandomVideoSource : public DummyVideoSource { RandomVideoSource(int seed = ACMRandom::DeterministicSeed()) : rnd_(seed), seed_(seed) {} - protected: // Reset the RNG to get a matching stream for the second pass void Begin() override { frame_ = 0; @@ -244,6 +243,7 @@ class RandomVideoSource : public DummyVideoSource { FillFrame(); } + protected: // 15 frames of noise, followed by 15 static frames. Reset to 0 rather // than holding previous frames to encourage keyframes to be thrown. void FillFrame() override { diff --git a/media/libvpx/libvpx/test/vp8_datarate_test.cc b/media/libvpx/libvpx/test/vp8_datarate_test.cc index aee27af66e..d47ed298fe 100644 --- a/media/libvpx/libvpx/test/vp8_datarate_test.cc +++ b/media/libvpx/libvpx/test/vp8_datarate_test.cc @@ -14,7 +14,7 @@ #include "test/i420_video_source.h" #include "test/util.h" #include "test/y4m_video_source.h" -#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" namespace { @@ -260,6 +260,27 @@ class DatarateTestLarge << " The datarate for the file missed the target!"; } + virtual void MultiThreadsPSNRTest() { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_threads = 4; + init_flags_ = VPX_CODEC_USE_PSNR; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, 30); + cfg_.rc_target_bitrate = 1000; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.5) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 2.0) + << " 
The datarate for the file missed the target!"; + } + vpx_codec_pts_t last_pts_; int64_t bits_in_buffer_model_; double timebase_; @@ -324,6 +345,8 @@ TEST_P(DatarateTestRealTime, DropFramesMultiThreads) { DropFramesMultiThreadsTest(); } +TEST_P(DatarateTestRealTime, MultiThreadsPSNR) { MultiThreadsPSNRTest(); } + TEST_P(DatarateTestRealTime, RegionOfInterest) { denoiser_on_ = 0; cfg_.rc_buf_initial_sz = 500; diff --git a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc index 50478f7635..d87fef5a46 100644 --- a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc +++ b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc @@ -149,9 +149,16 @@ class Vp8RcInterfaceTest return; } int qp; + libvpx::UVDeltaQP uv_delta_qp; encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) { ASSERT_EQ(rc_api_->GetQP(), qp); + uv_delta_qp = rc_api_->GetUVDeltaQP(); + // delta_qp for UV channel is only set for screen. 
+ if (!rc_cfg_.is_screen) { + ASSERT_EQ(uv_delta_qp.uvdc_delta_q, 0); + ASSERT_EQ(uv_delta_qp.uvac_delta_q, 0); + } } else { num_drops_++; } diff --git a/media/libvpx/libvpx/test/vp9_block_error_test.cc b/media/libvpx/libvpx/test/vp9_block_error_test.cc index 0645341ac1..c5ddcd58ab 100644 --- a/media/libvpx/libvpx/test/vp9_block_error_test.cc +++ b/media/libvpx/libvpx/test/vp9_block_error_test.cc @@ -215,4 +215,13 @@ const BlockErrorParam neon_block_error_tests[] = { INSTANTIATE_TEST_SUITE_P(NEON, BlockErrorTest, ::testing::ValuesIn(neon_block_error_tests)); #endif // HAVE_NEON + +#if HAVE_SVE +const BlockErrorParam sve_block_error_tests[] = { make_tuple( + &BlockError8BitWrapper<vp9_block_error_sve>, + &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8) }; + +INSTANTIATE_TEST_SUITE_P(SVE, BlockErrorTest, + ::testing::ValuesIn(sve_block_error_tests)); +#endif // HAVE_SVE } // namespace diff --git a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc index 33fa05c65c..5c23a5b0d5 100644 --- a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc +++ b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc @@ -10,115 +10,78 @@ #include <cstdint> #include <new> +#include <memory> + +#include "./vpx_config.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/util.h" #include "test/yuv_video_source.h" #include "third_party/googletest/src/include/gtest/gtest.h" +#if CONFIG_VP9_DECODER +#include "vpx/vp8dx.h" +#endif #include "vp9/simple_encode.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" #include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_image.h" #include "vpx/vpx_tpl.h" #include "vpx_dsp/vpx_dsp_common.h" namespace { -constexpr int kModelMagicNumber = 51396; -constexpr uintptr_t PrivMagicNumber = 5566; -constexpr int kFrameNum = 5; -constexpr int kFrameNumGOP = 30; -constexpr int kFrameNumGOPShort = 4; -constexpr int kLosslessCodingIndex = 2; -constexpr int kFixedGOPSize = 9; 
-// The range check in vp9_cx_iface.c shows that the max -// lag in buffer is MAX_LAG_BUFFERS (25): -// RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); -constexpr int kMaxLagInFrames = 25; -constexpr int kDefaultMinGfInterval = 4; -constexpr int kDefaultMaxGfInterval = 16; -// The active gf interval might change for each GOP -// See function "get_active_gf_inverval_range". -// The numbers below are from manual inspection. -constexpr int kReadMinGfInterval = 5; -constexpr int kReadMaxGfInterval = 13; -const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv"; -const double kPsnrThreshold = 30.4; - -struct ToyRateCtrl { - int magic_number; - int coding_index; - - int gop_global_index; - int frames_since_key; - int show_index; +constexpr int kFrameNum = 10; +constexpr int kFixedGOPSize = 10; +constexpr int kKeyframeQp = 10; +constexpr int kLeafQp = 40; +constexpr int kArfQp = 15; + +// Simple external rate controller for testing. +class RateControllerForTest { + public: + RateControllerForTest() : current_gop_(-1) {} + ~RateControllerForTest() {} + + void StartNextGop() { ++current_gop_; } + + vpx_rc_gop_decision_t GetCurrentGop() const { + vpx_rc_gop_decision_t gop_decision; + gop_decision.use_key_frame = current_gop_ == 0 ? 1 : 0; + gop_decision.use_alt_ref = 1; + gop_decision.gop_coding_frames = kFixedGOPSize; + return gop_decision; + } + + int CalculateFrameDecision(int frame_index) { + EXPECT_LE(frame_index, kFixedGOPSize); + if (current_gop_ == 0 && frame_index == 0) { + // Key frame, first frame in the first GOP. + return kKeyframeQp; + } else if (frame_index == 1) { + // ARF, we always use ARF for this test. 
+ return kArfQp; + } else { + return kLeafQp; + } + } + int current_gop_; }; -vpx_rc_status_t rc_create_model(void *priv, - const vpx_rc_config_t *ratectrl_config, - vpx_rc_model_t *rate_ctrl_model_ptr) { - ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; - if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; - toy_rate_ctrl->magic_number = kModelMagicNumber; - toy_rate_ctrl->coding_index = -1; - *rate_ctrl_model_ptr = toy_rate_ctrl; - EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber)); - EXPECT_EQ(ratectrl_config->frame_width, 352); - EXPECT_EQ(ratectrl_config->frame_height, 288); - EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNum); - EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 24000); - EXPECT_EQ(ratectrl_config->frame_rate_num, 30); - EXPECT_EQ(ratectrl_config->frame_rate_den, 1); - return VPX_RC_OK; -} - -vpx_rc_status_t rc_create_model_gop(void *priv, - const vpx_rc_config_t *ratectrl_config, - vpx_rc_model_t *rate_ctrl_model_ptr) { - ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; - if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; - toy_rate_ctrl->magic_number = kModelMagicNumber; - toy_rate_ctrl->gop_global_index = 0; - toy_rate_ctrl->frames_since_key = 0; - toy_rate_ctrl->show_index = 0; - toy_rate_ctrl->coding_index = 0; - *rate_ctrl_model_ptr = toy_rate_ctrl; - EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber)); - EXPECT_EQ(ratectrl_config->frame_width, 640); - EXPECT_EQ(ratectrl_config->frame_height, 360); - EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOP); - EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 4000); - EXPECT_EQ(ratectrl_config->frame_rate_num, 30); - EXPECT_EQ(ratectrl_config->frame_rate_den, 1); - return VPX_RC_OK; -} - -vpx_rc_status_t rc_create_model_gop_short( - void *priv, const vpx_rc_config_t *ratectrl_config, +// Callbacks used in this test. 
+vpx_rc_status_t rc_test_create_model( + void * /*priv*/, const vpx_rc_config_t * /*ratectrl_config*/, vpx_rc_model_t *rate_ctrl_model_ptr) { - ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; - if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; - toy_rate_ctrl->magic_number = kModelMagicNumber; - toy_rate_ctrl->gop_global_index = 0; - toy_rate_ctrl->frames_since_key = 0; - toy_rate_ctrl->show_index = 0; - toy_rate_ctrl->coding_index = 0; - *rate_ctrl_model_ptr = toy_rate_ctrl; - EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber)); - EXPECT_EQ(ratectrl_config->frame_width, 352); - EXPECT_EQ(ratectrl_config->frame_height, 288); - EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOPShort); - EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 500); - EXPECT_EQ(ratectrl_config->frame_rate_num, 30); - EXPECT_EQ(ratectrl_config->frame_rate_den, 1); + std::unique_ptr<RateControllerForTest> test_controller( + new RateControllerForTest()); + *rate_ctrl_model_ptr = test_controller.release(); return VPX_RC_OK; } -vpx_rc_status_t rc_send_firstpass_stats( - vpx_rc_model_t rate_ctrl_model, +vpx_rc_status_t rc_test_send_firstpass_stats( + vpx_rc_model_t /*rate_ctrl_model*/, const vpx_rc_firstpass_stats_t *first_pass_stats) { - const ToyRateCtrl *toy_rate_ctrl = - static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); EXPECT_EQ(first_pass_stats->num_frames, kFrameNum); for (int i = 0; i < first_pass_stats->num_frames; ++i) { EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); @@ -126,37 +89,8 @@ vpx_rc_status_t rc_send_firstpass_stats( return VPX_RC_OK; } -vpx_rc_status_t rc_send_firstpass_stats_gop( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_firstpass_stats_t *first_pass_stats) { - const ToyRateCtrl *toy_rate_ctrl = - static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOP); - for (int i = 
0; i < first_pass_stats->num_frames; ++i) { - EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); - } - return VPX_RC_OK; -} - -vpx_rc_status_t rc_send_firstpass_stats_gop_short( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_firstpass_stats_t *first_pass_stats) { - const ToyRateCtrl *toy_rate_ctrl = - static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOPShort); - for (int i = 0; i < first_pass_stats->num_frames; ++i) { - EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); - } - return VPX_RC_OK; -} - -vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model, - const VpxTplGopStats *tpl_gop_stats) { - const ToyRateCtrl *toy_rate_ctrl = - static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); +vpx_rc_status_t rc_test_send_tpl_gop_stats( + vpx_rc_model_t /*rate_ctrl_model*/, const VpxTplGopStats *tpl_gop_stats) { EXPECT_GT(tpl_gop_stats->size, 0); for (int i = 0; i < tpl_gop_stats->size; ++i) { @@ -165,522 +99,38 @@ vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model, return VPX_RC_OK; } -vpx_rc_status_t rc_get_encodeframe_decision( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_info_t *encode_frame_info, +vpx_rc_status_t rc_test_get_encodeframe_decision( + vpx_rc_model_t rate_ctrl_model, const int frame_gop_index, vpx_rc_encodeframe_decision_t *frame_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - toy_rate_ctrl->coding_index += 1; - - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - - EXPECT_LT(encode_frame_info->show_index, kFrameNum); - EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); - - if (encode_frame_info->coding_index == 0) { - EXPECT_EQ(encode_frame_info->show_index, 0); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, 
vp9::kFrameTypeKey); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - } else if (encode_frame_info->coding_index == 1) { - EXPECT_EQ(encode_frame_info->show_index, 4); - EXPECT_EQ(encode_frame_info->gop_index, 1); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 1); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], - 0); // kRefFrameTypeLast - } else if (encode_frame_info->coding_index >= 2 && - encode_frame_info->coding_index < 5) { - // In the first group of pictures, coding_index and gop_index are equal. - EXPECT_EQ(encode_frame_info->gop_index, encode_frame_info->coding_index); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - } else if (encode_frame_info->coding_index == 5) { - EXPECT_EQ(encode_frame_info->show_index, 4); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 1); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 1); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 1); // kRefFrameTypeFuture - EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], - 4); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[2], - 1); // kRefFrameTypeFuture - } - if (encode_frame_info->coding_index == kLosslessCodingIndex) { - // We should get sse == 0 at rc_update_encodeframe_result() - 
frame_decision->q_index = 0; - } else { - frame_decision->q_index = 100; - } - frame_decision->max_frame_size = 0; + RateControllerForTest *test_controller = + static_cast<RateControllerForTest *>(rate_ctrl_model); + frame_decision->q_index = + test_controller->CalculateFrameDecision(frame_gop_index); return VPX_RC_OK; } -vpx_rc_status_t rc_get_encodeframe_decision_gop( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_info_t *encode_frame_info, - vpx_rc_encodeframe_decision_t *frame_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_LT(encode_frame_info->show_index, kFrameNumGOP); - EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); - - if (encode_frame_info->coding_index == 0) { - EXPECT_EQ(encode_frame_info->show_index, 0); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - } else if (encode_frame_info->coding_index == 1) { - EXPECT_EQ(encode_frame_info->show_index, 1); - EXPECT_EQ(encode_frame_info->gop_index, 1); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 1); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], - 0); // kRefFrameTypeLast - } else if (encode_frame_info->coding_index == 2) { - EXPECT_EQ(encode_frame_info->show_index, 2); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); - 
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - } else if (encode_frame_info->coding_index == 3 || - encode_frame_info->coding_index == 12 || - encode_frame_info->coding_index == 21) { - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); - EXPECT_EQ(encode_frame_info->gop_index, 1); - } else if (encode_frame_info->coding_index == 11 || - encode_frame_info->coding_index == 20 || - encode_frame_info->coding_index == 29) { - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); - EXPECT_EQ(encode_frame_info->gop_index, 0); - } else if (encode_frame_info->coding_index >= 30) { - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - } - - // When the model recommends an invalid q, valid range [0, 255], - // the encoder will ignore it and use the default q selected - // by libvpx rate control strategy. 
- frame_decision->q_index = VPX_DEFAULT_Q; - frame_decision->max_frame_size = 0; - - toy_rate_ctrl->coding_index += 1; - return VPX_RC_OK; -} - -vpx_rc_status_t rc_get_encodeframe_decision_gop_short( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_info_t *encode_frame_info, - vpx_rc_encodeframe_decision_t *frame_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); - EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); - - if (encode_frame_info->coding_index == 0) { - EXPECT_EQ(encode_frame_info->show_index, 0); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 1) { - EXPECT_EQ(encode_frame_info->show_index, 1); - EXPECT_EQ(encode_frame_info->gop_index, 1); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 1); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 2) { - EXPECT_EQ(encode_frame_info->show_index, 2); - EXPECT_EQ(encode_frame_info->gop_index, 2); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if 
(encode_frame_info->coding_index == 3) { - EXPECT_EQ(encode_frame_info->show_index, 3); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2); - } - - // When the model recommends an invalid q, valid range [0, 255], - // the encoder will ignore it and use the default q selected - // by libvpx rate control strategy. - frame_decision->q_index = VPX_DEFAULT_Q; - frame_decision->max_frame_size = 0; - - toy_rate_ctrl->coding_index += 1; - return VPX_RC_OK; -} - -vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_info_t *encode_frame_info, - vpx_rc_encodeframe_decision_t *frame_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); - EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); - - if (encode_frame_info->coding_index == 0) { - EXPECT_EQ(encode_frame_info->show_index, 0); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 1) { - EXPECT_EQ(encode_frame_info->show_index, 3); - EXPECT_EQ(encode_frame_info->gop_index, 1); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 1); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // 
kRefFrameTypeFuture - EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 2) { - EXPECT_EQ(encode_frame_info->show_index, 1); - EXPECT_EQ(encode_frame_info->gop_index, 2); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 3) { - EXPECT_EQ(encode_frame_info->show_index, 2); - EXPECT_EQ(encode_frame_info->gop_index, 3); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 4) { - EXPECT_EQ(encode_frame_info->show_index, 3); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } - - // When the model recommends an invalid q, valid range [0, 255], - // the encoder will ignore it and use the default q selected - // by libvpx rate control strategy. 
- frame_decision->q_index = VPX_DEFAULT_Q; - frame_decision->max_frame_size = 0; - - toy_rate_ctrl->coding_index += 1; - return VPX_RC_OK; -} - -vpx_rc_status_t rc_get_encodeframe_decision_gop_short_no_arf( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_info_t *encode_frame_info, - vpx_rc_encodeframe_decision_t *frame_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); - EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); - - if (encode_frame_info->coding_index == 0) { - EXPECT_EQ(encode_frame_info->show_index, 0); - EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 1) { - EXPECT_EQ(encode_frame_info->show_index, 1); - EXPECT_EQ(encode_frame_info->gop_index, 1); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], - 1); // kRefFrameTypeLast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], - 0); // kRefFrameTypePast - EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], - 0); // kRefFrameTypeFuture - EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], - 0); // kRefFrameTypeLast - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if (encode_frame_info->coding_index == 2) { - EXPECT_EQ(encode_frame_info->show_index, 2); - EXPECT_EQ(encode_frame_info->gop_index, 2); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } else if 
(encode_frame_info->coding_index == 3) { - EXPECT_EQ(encode_frame_info->show_index, 3); - EXPECT_EQ(encode_frame_info->gop_index, 3); - EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); - } - - // When the model recommends an invalid q, valid range [0, 255], - // the encoder will ignore it and use the default q selected - // by libvpx rate control strategy. - frame_decision->q_index = VPX_DEFAULT_Q; - frame_decision->max_frame_size = 0; - - toy_rate_ctrl->coding_index += 1; - return VPX_RC_OK; -} - -vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, - const vpx_rc_gop_info_t *gop_info, - vpx_rc_gop_decision_t *gop_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames); - EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); - EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); - EXPECT_EQ(gop_info->active_min_gf_interval, kReadMinGfInterval); - EXPECT_EQ(gop_info->active_max_gf_interval, kReadMaxGfInterval); - EXPECT_EQ(gop_info->allow_alt_ref, 1); - if (gop_info->is_key_frame) { - EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); - EXPECT_EQ(gop_info->frames_since_key, 0); - EXPECT_EQ(gop_info->gop_global_index, 0); - toy_rate_ctrl->gop_global_index = 0; - toy_rate_ctrl->frames_since_key = 0; - } else { - EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1); - } - EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); - EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); - EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); - EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); - - gop_decision->gop_coding_frames = - VPXMIN(kFixedGOPSize, gop_info->frames_to_key); - gop_decision->use_alt_ref = gop_decision->gop_coding_frames == kFixedGOPSize; - toy_rate_ctrl->frames_since_key += - 
gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - toy_rate_ctrl->show_index += - gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - ++toy_rate_ctrl->gop_global_index; - return VPX_RC_OK; -} - -// Test on a 4 frame video. -// Test a setting of 2 GOPs. -// The first GOP has 3 coding frames, no alt ref. -// The second GOP has 1 coding frame, no alt ref. -vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model, - const vpx_rc_gop_info_t *gop_info, - vpx_rc_gop_decision_t *gop_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); - EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); - EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); - EXPECT_EQ(gop_info->allow_alt_ref, 1); - if (gop_info->is_key_frame) { - EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); - EXPECT_EQ(gop_info->frames_since_key, 0); - EXPECT_EQ(gop_info->gop_global_index, 0); - toy_rate_ctrl->gop_global_index = 0; - toy_rate_ctrl->frames_since_key = 0; - } else { - EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); - } - EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); - EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); - EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); - EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); - - gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 3 : 1; - gop_decision->use_alt_ref = 0; - toy_rate_ctrl->frames_since_key += - gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - toy_rate_ctrl->show_index += - gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - ++toy_rate_ctrl->gop_global_index; - return VPX_RC_OK; -} - -// Test on a 4 frame video. -// Test a setting of 2 GOPs. -// The first GOP has 4 coding frames. Use alt ref. 
-// The second GOP only contains the overlay frame of the first GOP's alt ref -// frame. -vpx_rc_status_t rc_get_gop_decision_short_overlay( - vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, - vpx_rc_gop_decision_t *gop_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); - EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); - EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); - EXPECT_EQ(gop_info->allow_alt_ref, 1); - if (gop_info->is_key_frame) { - EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); - EXPECT_EQ(gop_info->frames_since_key, 0); - EXPECT_EQ(gop_info->gop_global_index, 0); - toy_rate_ctrl->gop_global_index = 0; - toy_rate_ctrl->frames_since_key = 0; - } else { - EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1); - } - EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); - EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); - EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); - EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); - - gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1; - gop_decision->use_alt_ref = gop_info->is_key_frame ? 1 : 0; - toy_rate_ctrl->frames_since_key += - gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - toy_rate_ctrl->show_index += - gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - ++toy_rate_ctrl->gop_global_index; - return VPX_RC_OK; -} - -// Test on a 4 frame video. -// Test a setting of 1 GOP. -// The GOP has 4 coding frames. Do not use alt ref. 
-vpx_rc_status_t rc_get_gop_decision_short_no_arf( - vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, - vpx_rc_gop_decision_t *gop_decision) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); - EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); - EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); - EXPECT_EQ(gop_info->allow_alt_ref, 1); - if (gop_info->is_key_frame) { - EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); - EXPECT_EQ(gop_info->frames_since_key, 0); - EXPECT_EQ(gop_info->gop_global_index, 0); - toy_rate_ctrl->gop_global_index = 0; - toy_rate_ctrl->frames_since_key = 0; - } else { - EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); - } - EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); - EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); - EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); - EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); - - gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 
4 : 1; - gop_decision->use_alt_ref = 0; - toy_rate_ctrl->frames_since_key += - gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - toy_rate_ctrl->show_index += - gop_decision->gop_coding_frames - gop_decision->use_alt_ref; - ++toy_rate_ctrl->gop_global_index; - return VPX_RC_OK; -} - -vpx_rc_status_t rc_update_encodeframe_result( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_result_t *encode_frame_result) { - const ToyRateCtrl *toy_rate_ctrl = - static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - - const int64_t ref_pixel_count = 352 * 288 * 3 / 2; - EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); - if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) { - EXPECT_EQ(encode_frame_result->sse, 0); - } - if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) { - EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 0); - } else { - EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 100); - } - return VPX_RC_OK; -} - -vpx_rc_status_t rc_update_encodeframe_result_gop( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_result_t *encode_frame_result) { - const ToyRateCtrl *toy_rate_ctrl = - static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - - const int64_t ref_pixel_count = 640 * 360 * 3 / 2; - EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); - return VPX_RC_OK; -} - -vpx_rc_status_t rc_update_encodeframe_result_gop_short( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_result_t *encode_frame_result) { - const ToyRateCtrl *toy_rate_ctrl = - static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - - const int64_t ref_pixel_count = 352 * 288 * 3 / 2; - EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); - return VPX_RC_OK; -} - -vpx_rc_status_t rc_get_default_frame_rdmult( - vpx_rc_model_t rate_ctrl_model, - const 
vpx_rc_encodeframe_info_t *encode_frame_info, int *rdmult) { - const ToyRateCtrl *toy_rate_ctrl = - static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); - EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); - - *rdmult = VPX_DEFAULT_RDMULT; +vpx_rc_status_t rc_test_get_gop_decision(vpx_rc_model_t rate_ctrl_model, + vpx_rc_gop_decision_t *gop_decision) { + RateControllerForTest *test_controller = + static_cast<RateControllerForTest *>(rate_ctrl_model); + test_controller->StartNextGop(); + *gop_decision = test_controller->GetCurrentGop(); return VPX_RC_OK; } vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) { - ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); - EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); - delete toy_rate_ctrl; + RateControllerForTest *test_controller = + static_cast<RateControllerForTest *>(rate_ctrl_model); + delete test_controller; return VPX_RC_OK; } class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, public ::testing::Test { protected: - ExtRateCtrlTest() : EncoderTest(&::libvpx_test::kVP9) {} + ExtRateCtrlTest() + : EncoderTest(&::libvpx_test::kVP9), frame_number_(0), + current_frame_qp_(0) {} ~ExtRateCtrlTest() override = default; @@ -693,287 +143,62 @@ class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { vpx_rc_funcs_t rc_funcs = {}; - rc_funcs.rc_type = VPX_RC_QP; - rc_funcs.create_model = rc_create_model; - rc_funcs.send_firstpass_stats = rc_send_firstpass_stats; - rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision; - rc_funcs.update_encodeframe_result = rc_update_encodeframe_result; - rc_funcs.delete_model = rc_delete_model; - rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); - encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); - } - } 
-}; - -TEST_F(ExtRateCtrlTest, EncodeTest) { - cfg_.rc_target_bitrate = 24000; - - std::unique_ptr<libvpx_test::VideoSource> video; - video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( - "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, - kFrameNum)); - - ASSERT_NE(video, nullptr); - ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); -} - -class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWithParam<int> { - protected: - ExtRateCtrlTestGOP() : EncoderTest(&::libvpx_test::kVP9) {} - - ~ExtRateCtrlTestGOP() override = default; - - void SetUp() override { - InitializeConfig(); - SetMode(::libvpx_test::kTwoPassGood); - } - - void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) override { - if (video->frame() == 0) { - encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); - encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); - - vpx_rc_funcs_t rc_funcs = {}; - rc_funcs.rc_type = VPX_RC_GOP_QP; - rc_funcs.create_model = rc_create_model_gop; - rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop; - rc_funcs.send_tpl_gop_stats = rc_send_tpl_gop_stats; - rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop; - rc_funcs.get_gop_decision = rc_get_gop_decision; - rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop; - rc_funcs.delete_model = rc_delete_model; - rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); - encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); - } - } -}; - -TEST_F(ExtRateCtrlTestGOP, EncodeTest) { - cfg_.rc_target_bitrate = 4000; - cfg_.g_lag_in_frames = kMaxLagInFrames; - cfg_.rc_end_usage = VPX_VBR; - - std::unique_ptr<libvpx_test::VideoSource> video; - video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( - "noisy_clip_640_360.y4m", VPX_IMG_FMT_I420, 640, 360, 30, 1, 0, - kFrameNumGOP)); - - ASSERT_NE(video, nullptr); - 
ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); -} - -class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWithParam<int> { - protected: - ExtRateCtrlTestGOPShort() : EncoderTest(&::libvpx_test::kVP9) {} - - ~ExtRateCtrlTestGOPShort() override = default; - - void SetUp() override { - InitializeConfig(); - SetMode(::libvpx_test::kTwoPassGood); - } - - void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) override { - if (video->frame() == 0) { - encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); - encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); - encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - - vpx_rc_funcs_t rc_funcs = {}; - rc_funcs.rc_type = VPX_RC_GOP_QP; - rc_funcs.create_model = rc_create_model_gop_short; - rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; - rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short; - rc_funcs.get_gop_decision = rc_get_gop_decision_short; - rc_funcs.update_encodeframe_result = - rc_update_encodeframe_result_gop_short; - rc_funcs.delete_model = rc_delete_model; - rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); - encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); - } - } -}; - -TEST_F(ExtRateCtrlTestGOPShort, EncodeTest) { - cfg_.rc_target_bitrate = 500; - cfg_.g_lag_in_frames = kMaxLagInFrames - 1; - cfg_.rc_end_usage = VPX_VBR; - - std::unique_ptr<libvpx_test::VideoSource> video; - video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( - kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); - - ASSERT_NE(video, nullptr); - ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); -} - -class ExtRateCtrlTestGOPShortOverlay - : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWithParam<int> { - protected: - ExtRateCtrlTestGOPShortOverlay() : EncoderTest(&::libvpx_test::kVP9) {} - - 
~ExtRateCtrlTestGOPShortOverlay() override = default; - - void SetUp() override { - InitializeConfig(); - SetMode(::libvpx_test::kTwoPassGood); - } - - void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) override { - if (video->frame() == 0) { - encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); - encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); - encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - - vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; - rc_funcs.create_model = rc_create_model_gop_short; - rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; - rc_funcs.get_encodeframe_decision = - rc_get_encodeframe_decision_gop_short_overlay; - rc_funcs.get_gop_decision = rc_get_gop_decision_short_overlay; - rc_funcs.update_encodeframe_result = - rc_update_encodeframe_result_gop_short; + rc_funcs.create_model = rc_test_create_model; + rc_funcs.send_firstpass_stats = rc_test_send_firstpass_stats; + rc_funcs.send_tpl_gop_stats = rc_test_send_tpl_gop_stats; + rc_funcs.get_gop_decision = rc_test_get_gop_decision; + rc_funcs.get_encodeframe_decision = rc_test_get_encodeframe_decision; rc_funcs.delete_model = rc_delete_model; - rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); } } -}; - -TEST_F(ExtRateCtrlTestGOPShortOverlay, EncodeTest) { - cfg_.rc_target_bitrate = 500; - cfg_.g_lag_in_frames = kMaxLagInFrames - 1; - cfg_.rc_end_usage = VPX_VBR; - - std::unique_ptr<libvpx_test::VideoSource> video; - video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( - kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); - - ASSERT_NE(video, nullptr); - ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); -} - -class ExtRateCtrlTestGOPShortNoARF - : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWithParam<int> { - protected: - ExtRateCtrlTestGOPShortNoARF() : 
EncoderTest(&::libvpx_test::kVP9) {} - - ~ExtRateCtrlTestGOPShortNoARF() override = default; - void SetUp() override { - InitializeConfig(); - SetMode(::libvpx_test::kTwoPassGood); +#if CONFIG_VP9_DECODER + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const ::libvpx_test::VideoSource & /*video*/, + ::libvpx_test::Decoder *decoder) override { + EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); + decoder->Control(VPXD_GET_LAST_QUANTIZER, &current_frame_qp_); + return VPX_CODEC_OK == res_dec; } - void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) override { - if (video->frame() == 0) { - encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); - encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); - encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - - vpx_rc_funcs_t rc_funcs = {}; - rc_funcs.rc_type = VPX_RC_GOP_QP; - rc_funcs.create_model = rc_create_model_gop_short; - rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; - rc_funcs.get_encodeframe_decision = - rc_get_encodeframe_decision_gop_short_no_arf; - rc_funcs.get_gop_decision = rc_get_gop_decision_short_no_arf; - rc_funcs.update_encodeframe_result = - rc_update_encodeframe_result_gop_short; - rc_funcs.delete_model = rc_delete_model; - rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); - encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + if (frame_number_ == 0) { + // This must be a key frame + EXPECT_TRUE((pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0); + EXPECT_EQ(current_frame_qp_, kKeyframeQp); + ++frame_number_; + return; } - } -}; - -TEST_F(ExtRateCtrlTestGOPShortNoARF, EncodeTest) { - cfg_.rc_target_bitrate = 500; - cfg_.g_lag_in_frames = kMaxLagInFrames - 1; - cfg_.rc_end_usage = VPX_VBR; - - std::unique_ptr<libvpx_test::VideoSource> video; - video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( - kTestFileName, 
VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); - - ASSERT_NE(video, nullptr); - ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); -} - -class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest, - public ::testing::Test { - protected: - ExtRateCtrlTestRdmult() : EncoderTest(&::libvpx_test::kVP9) {} - - ~ExtRateCtrlTestRdmult() override = default; - - void SetUp() override { - InitializeConfig(); - SetMode(::libvpx_test::kTwoPassGood); - } - - void BeginPassHook(unsigned int) override { - psnr_ = 0.0; - nframes_ = 0; - } - - void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { - psnr_ += pkt->data.psnr.psnr[0]; - nframes_++; - } - void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) override { - if (video->frame() == 0) { - vpx_rc_funcs_t rc_funcs = {}; - rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT; - rc_funcs.create_model = rc_create_model_gop_short; - rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; - rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short; - rc_funcs.get_gop_decision = rc_get_gop_decision_short; - rc_funcs.update_encodeframe_result = - rc_update_encodeframe_result_gop_short; - rc_funcs.get_frame_rdmult = rc_get_default_frame_rdmult; - rc_funcs.delete_model = rc_delete_model; - rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); - encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + if ((pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) != 0) { + // This is ARF + EXPECT_EQ(current_frame_qp_, kArfQp); + ++frame_number_; + return; } - } - double GetAveragePsnr() const { - if (nframes_) return psnr_ / nframes_; - return 0.0; + EXPECT_EQ(current_frame_qp_, kLeafQp); + ++frame_number_; } +#endif // CONFIG_VP9_DECODER - private: - double psnr_; - unsigned int nframes_; + int frame_number_; + int current_frame_qp_; }; -TEST_F(ExtRateCtrlTestRdmult, DefaultRdmult) { - cfg_.rc_target_bitrate = 500; - cfg_.g_lag_in_frames = kMaxLagInFrames - 1; - 
cfg_.rc_end_usage = VPX_VBR; - init_flags_ = VPX_CODEC_USE_PSNR; +TEST_F(ExtRateCtrlTest, EncodeTest) { + cfg_.rc_target_bitrate = 4000; + cfg_.g_lag_in_frames = 25; std::unique_ptr<libvpx_test::VideoSource> video; video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( - kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, + kFrameNum)); ASSERT_NE(video, nullptr); ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); - - const double psnr = GetAveragePsnr(); - EXPECT_GT(psnr, kPsnrThreshold); } } // namespace diff --git a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc index f7be47542c..a6c7563348 100644 --- a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc +++ b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc @@ -9,6 +9,7 @@ */ #include "vp9/ratectrl_rtc.h" +#include <climits> #include <fstream> // NOLINT #include <string> @@ -19,6 +20,8 @@ #include "test/i420_video_source.h" #include "test/util.h" #include "test/video_source.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_svc_layercontext.h" #include "vpx/vpx_codec.h" #include "vpx_ports/bitops.h" diff --git a/media/libvpx/libvpx/test/vp9_scale_test.cc b/media/libvpx/libvpx/test/vp9_scale_test.cc index 049a10a617..a5a18a7e9d 100644 --- a/media/libvpx/libvpx/test/vp9_scale_test.cc +++ b/media/libvpx/libvpx/test/vp9_scale_test.cc @@ -48,12 +48,11 @@ class ScaleTest : public VpxScaleBase, } void RunTest(INTERP_FILTER filter_type) { - static const int kNumSizesToTest = 20; + static const int kNumSizesToTest = 22; static const int kNumScaleFactorsToTest = 4; - static const int kSizesToTest[] = { - 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, - 22, 24, 26, 28, 30, 32, 34, 68, 128, 134 - }; + static const int kSizesToTest[] = { 1, 2, 3, 4, 6, 8, 10, 12, + 14, 16, 18, 20, 22, 24, 26, 28, + 30, 32, 34, 68, 128, 134 }; static const int kScaleFactors[] = { 1, 2, 3, 4 
}; for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) { for (int h = 0; h < kNumSizesToTest; ++h) { diff --git a/media/libvpx/libvpx/tools_common.c b/media/libvpx/libvpx/tools_common.c index 5c13781513..5af971f720 100644 --- a/media/libvpx/libvpx/tools_common.c +++ b/media/libvpx/libvpx/tools_common.c @@ -26,15 +26,9 @@ #include "vpx/vpx_codec.h" -#if defined(_WIN32) || defined(__OS2__) +#if defined(_WIN32) #include <io.h> #include <fcntl.h> - -#ifdef __OS2__ -#define _setmode setmode -#define _fileno fileno -#define _O_BINARY O_BINARY -#endif #endif #define LOG_ERROR(label) \ @@ -58,7 +52,7 @@ static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { FILE *set_binary_mode(FILE *stream) { (void)stream; -#if defined(_WIN32) || defined(__OS2__) +#if defined(_WIN32) _setmode(_fileno(stream), _O_BINARY); #endif return stream; @@ -96,9 +90,9 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) { int w = vpx_img_plane_width(yuv_frame, plane); const int h = vpx_img_plane_height(yuv_frame, plane); int r; - // Assuming that for nv12 we read all chroma data at one time + // Assuming that for nv12 we read all chroma data at once if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane > 1) break; - // Fixing NV12 chroma width it is odd + // Fixing NV12 chroma width if it is odd if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1; /* Determine the correct plane based on the image format. The for-loop * always counts in Y,U,V order, but this may not match the order of @@ -229,17 +223,22 @@ int vpx_img_plane_height(const vpx_image_t *img, int plane) { void vpx_img_write(const vpx_image_t *img, FILE *file) { int plane; + const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; for (plane = 0; plane < 3; ++plane) { const unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; - const int w = vpx_img_plane_width(img, plane) * - ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1); + int w = vpx_img_plane_width(img, plane); const int h = vpx_img_plane_height(img, plane); int y; + // Assuming that for nv12 we write all chroma data at once + if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break; + // Fixing NV12 chroma width if it is odd + if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1; + for (y = 0; y < h; ++y) { - fwrite(buf, 1, w, file); + fwrite(buf, bytespp, w, file); buf += stride; } } @@ -247,17 +246,22 @@ void vpx_img_write(const vpx_image_t *img, FILE *file) { int vpx_img_read(vpx_image_t *img, FILE *file) { int plane; + const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; for (plane = 0; plane < 3; ++plane) { unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; - const int w = vpx_img_plane_width(img, plane) * - ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + int w = vpx_img_plane_width(img, plane); const int h = vpx_img_plane_height(img, plane); int y; + // Assuming that for nv12 we read all chroma data at once + if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break; + // Fixing NV12 chroma width if it is odd + if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1; + for (y = 0; y < h; ++y) { - if (fread(buf, 1, w, file) != (size_t)w) return 0; + if (fread(buf, bytespp, w, file) != (size_t)w) return 0; buf += stride; } } diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c index ee3c281f0f..a54e81084b 100644 --- a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c +++ b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c @@ -16,7 +16,7 @@ #include "vpx_ports/mem.h" static const int8_t vp8_sub_pel_filters[8][8] = { - { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */ + { 0, 0, -128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */ { 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */ { 2, -11, 108, 36, -8, 1, 0, 0 }, 
/* New 1/4 pel 6 tap filter */ { 0, -9, 93, 50, -6, 0, 0, 0 }, diff --git a/media/libvpx/libvpx/vp8/common/entropy.c b/media/libvpx/libvpx/vp8/common/entropy.c index fc4a3539fd..b9efc0cc1f 100644 --- a/media/libvpx/libvpx/vp8/common/entropy.c +++ b/media/libvpx/libvpx/vp8/common/entropy.c @@ -114,7 +114,7 @@ static const vp8_prob Pcat6[] = { 254, 254, 243, 230, 196, 177, p[0] = p[1] = 0; } - void init_bit_trees() { + void init_bit_trees(void) { init_bit_tree(cat1, 1); init_bit_tree(cat2, 2); init_bit_tree(cat3, 3); diff --git a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c index 71529bdfd8..7c8e083f4f 100644 --- a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c +++ b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c @@ -25,23 +25,19 @@ #include "vp8/common/systemdependent.h" #if CONFIG_MULTITHREAD -#if HAVE_UNISTD_H && !defined(__OS2__) +#if HAVE_UNISTD_H #include <unistd.h> #elif defined(_WIN32) #include <windows.h> typedef void(WINAPI *PGNSI)(LPSYSTEM_INFO); -#elif defined(__OS2__) -#define INCL_DOS -#define INCL_DOSSPINLOCK -#include <os2.h> #endif #endif #if CONFIG_MULTITHREAD -static int get_cpu_count() { +static int get_cpu_count(void) { int core_count = 16; -#if HAVE_UNISTD_H && !defined(__OS2__) +#if HAVE_UNISTD_H #if defined(_SC_NPROCESSORS_ONLN) core_count = (int)sysconf(_SC_NPROCESSORS_ONLN); #elif defined(_SC_NPROC_ONLN) @@ -49,38 +45,13 @@ static int get_cpu_count() { #endif #elif defined(_WIN32) { -#if _WIN32_WINNT >= 0x0501 +#if _WIN32_WINNT < 0x0501 +#error _WIN32_WINNT must target Windows XP or newer. +#endif SYSTEM_INFO sysinfo; GetNativeSystemInfo(&sysinfo); -#else - PGNSI pGNSI; - SYSTEM_INFO sysinfo; - - /* Call GetNativeSystemInfo if supported or - * GetSystemInfo otherwise. 
*/ - - pGNSI = (PGNSI)GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), - "GetNativeSystemInfo"); - if (pGNSI != NULL) - pGNSI(&sysinfo); - else - GetSystemInfo(&sysinfo); -#endif - core_count = (int)sysinfo.dwNumberOfProcessors; } -#elif defined(__OS2__) - { - ULONG proc_id; - ULONG status; - - core_count = 0; - for (proc_id = 1;; ++proc_id) { - if (DosGetProcessorStatus(proc_id, &status)) break; - - if (status == PROC_ONLINE) core_count++; - } - } #else /* other platforms */ #endif diff --git a/media/libvpx/libvpx/vp8/common/onyx.h b/media/libvpx/libvpx/vp8/common/onyx.h index 1b70ea5dba..2038c000b0 100644 --- a/media/libvpx/libvpx/vp8/common/onyx.h +++ b/media/libvpx/libvpx/vp8/common/onyx.h @@ -242,7 +242,7 @@ typedef struct { #endif } VP8_CONFIG; -void vp8_initialize(); +void vp8_initialize(void); struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf); void vp8_remove_compressor(struct VP8_COMP **comp); diff --git a/media/libvpx/libvpx/vp8/common/rtcd.c b/media/libvpx/libvpx/vp8/common/rtcd.c index 09a0e2b4b3..102b7ccd54 100644 --- a/media/libvpx/libvpx/vp8/common/rtcd.c +++ b/media/libvpx/libvpx/vp8/common/rtcd.c @@ -12,4 +12,4 @@ #include "./vp8_rtcd.h" #include "vpx_ports/vpx_once.h" -void vp8_rtcd() { once(setup_rtcd_internal); } +void vp8_rtcd(void) { once(setup_rtcd_internal); } diff --git a/media/libvpx/libvpx/vp8/common/threading.h b/media/libvpx/libvpx/vp8/common/threading.h index 1cfb9fec51..0de75cfde3 100644 --- a/media/libvpx/libvpx/vp8/common/threading.h +++ b/media/libvpx/libvpx/vp8/common/threading.h @@ -19,161 +19,57 @@ extern "C" { #if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD -/* Thread management macros */ #if defined(_WIN32) && !HAVE_PTHREAD_H /* Win32 */ -#include <process.h> #include <windows.h> -#if defined(__GNUC__) && \ - (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) -#define THREAD_FUNCTION \ - __attribute__((force_align_arg_pointer)) unsigned int __stdcall -#else -#define THREAD_FUNCTION unsigned int __stdcall 
-#endif -#define THREAD_FUNCTION_RETURN DWORD -#define THREAD_SPECIFIC_INDEX DWORD -#define pthread_t HANDLE -#define pthread_attr_t DWORD -#define pthread_detach(thread) \ - if (thread != NULL) CloseHandle(thread) -#define thread_sleep(nms) Sleep(nms) -#define pthread_cancel(thread) terminate_thread(thread, 0) -#define ts_key_create(ts_key, destructor) \ - { ts_key = TlsAlloc(); }; -#define pthread_getspecific(ts_key) TlsGetValue(ts_key) -#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value) -#define pthread_self() GetCurrentThreadId() - -#elif defined(__OS2__) -/* OS/2 */ -#define INCL_DOS -#include <os2.h> - -#include <stdlib.h> -#define THREAD_FUNCTION void * -#define THREAD_FUNCTION_RETURN void * -#define THREAD_SPECIFIC_INDEX PULONG -#define pthread_t TID -#define pthread_attr_t ULONG -#define pthread_detach(thread) 0 -#define thread_sleep(nms) DosSleep(nms) -#define pthread_cancel(thread) DosKillThread(thread) -#define ts_key_create(ts_key, destructor) \ - DosAllocThreadLocalMemory(1, &(ts_key)); -#define pthread_getspecific(ts_key) ((void *)(*(ts_key))) -#define pthread_setspecific(ts_key, value) (*(ts_key) = (ULONG)(value)) -#define pthread_self() _gettid() #else +/* pthreads */ #ifdef __APPLE__ #include <mach/mach_init.h> #include <mach/semaphore.h> #include <mach/task.h> #include <time.h> #include <unistd.h> - #else #include <semaphore.h> #endif - -#include <pthread.h> -/* pthreads */ -/* Nearly everything is already defined */ -#define THREAD_FUNCTION void * -#define THREAD_FUNCTION_RETURN void * -#define THREAD_SPECIFIC_INDEX pthread_key_t -#define ts_key_create(ts_key, destructor) \ - pthread_key_create(&(ts_key), destructor); #endif /* Synchronization macros: Win32 and Pthreads */ #if defined(_WIN32) && !HAVE_PTHREAD_H -#define sem_t HANDLE -#define pause(voidpara) __asm PAUSE -#define sem_init(sem, sem_attr1, sem_init_value) \ - (int)((*sem = CreateSemaphore(NULL, 0, 32768, NULL)) == NULL) -#define sem_wait(sem) \ +#define 
vp8_sem_t HANDLE +#define vp8_sem_init(sem, pshared, value) \ + (int)((*sem = CreateSemaphore(NULL, value, 32768, NULL)) == NULL) +#define vp8_sem_wait(sem) \ (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem, INFINITE)) -#define sem_post(sem) ReleaseSemaphore(*sem, 1, NULL) -#define sem_destroy(sem) \ +#define vp8_sem_post(sem) ReleaseSemaphore(*sem, 1, NULL) +#define vp8_sem_destroy(sem) \ if (*sem) ((int)(CloseHandle(*sem)) == TRUE) #define thread_sleep(nms) Sleep(nms) -#elif defined(__OS2__) -typedef struct { - HEV event; - HMTX wait_mutex; - HMTX count_mutex; - int count; -} sem_t; - -static inline int sem_init(sem_t *sem, int pshared, unsigned int value) { - DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0, - value > 0 ? TRUE : FALSE); - DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE); - DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE); - - sem->count = value; - - return 0; -} - -static inline int sem_wait(sem_t *sem) { - DosRequestMutexSem(sem->wait_mutex, -1); - - DosWaitEventSem(sem->event, -1); - - DosRequestMutexSem(sem->count_mutex, -1); - - sem->count--; - if (sem->count == 0) { - ULONG post_count; - - DosResetEventSem(sem->event, &post_count); - } - - DosReleaseMutexSem(sem->count_mutex); - - DosReleaseMutexSem(sem->wait_mutex); - - return 0; -} - -static inline int sem_post(sem_t *sem) { - DosRequestMutexSem(sem->count_mutex, -1); - - if (sem->count < 32768) { - sem->count++; - DosPostEventSem(sem->event); - } - - DosReleaseMutexSem(sem->count_mutex); - - return 0; -} - -static inline int sem_destroy(sem_t *sem) { - DosCloseEventSem(sem->event); - DosCloseMutexSem(sem->wait_mutex); - DosCloseMutexSem(sem->count_mutex); - - return 0; -} - -#define thread_sleep(nms) DosSleep(nms) - #else #ifdef __APPLE__ -#define sem_t semaphore_t -#define sem_init(X, Y, Z) \ - semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z) -#define sem_wait(sem) (semaphore_wait(*sem)) -#define sem_post(sem) semaphore_signal(*sem) -#define 
sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem) +#define vp8_sem_t semaphore_t +#define vp8_sem_init(sem, pshared, value) \ + semaphore_create(mach_task_self(), sem, SYNC_POLICY_FIFO, value) +#define vp8_sem_wait(sem) semaphore_wait(*sem) +#define vp8_sem_post(sem) semaphore_signal(*sem) +#define vp8_sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem) #else +#include <errno.h> #include <unistd.h> #include <sched.h> +#define vp8_sem_t sem_t +#define vp8_sem_init sem_init +static INLINE int vp8_sem_wait(vp8_sem_t *sem) { + int ret; + while ((ret = sem_wait(sem)) == -1 && errno == EINTR) { + } + return ret; +} +#define vp8_sem_post sem_post +#define vp8_sem_destroy sem_destroy #endif /* __APPLE__ */ /* Not Windows. Assume pthreads */ @@ -194,7 +90,6 @@ static inline int sem_destroy(sem_t *sem) { #define x86_pause_hint() #endif -#include "vpx_util/vpx_thread.h" #include "vpx_util/vpx_atomics.h" static INLINE void vp8_atomic_spin_wait( diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c index 2248345ba2..88f2de024b 100644 --- a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c +++ b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c @@ -428,6 +428,7 @@ int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf) { #if CONFIG_MULTITHREAD if (setjmp(fb->pbi[0]->common.error.jmp)) { + fb->pbi[0]->common.error.setjmp = 0; vp8_remove_decoder_instances(fb); vp8_zero(fb->pbi); vpx_clear_system_state(); @@ -452,6 +453,7 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb) { /* decoder instance for single thread mode */ remove_decompressor(pbi); + fb->pbi[0] = NULL; return VPX_CODEC_OK; } diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h index 1070849620..08a60b31b9 100644 --- a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h +++ b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h @@ -14,6 +14,7 @@ #include <assert.h> #include "vpx_config.h" +#include 
"vpx_util/vpx_pthread.h" #include "vp8/common/onyxd.h" #include "treereader.h" #include "vp8/common/onyxc_int.h" @@ -94,8 +95,8 @@ typedef struct VP8D_COMP { DECODETHREAD_DATA *de_thread_data; pthread_t *h_decoding_thread; - sem_t *h_event_start_decoding; - sem_t h_event_end_decoding; + vp8_sem_t *h_event_start_decoding; + vp8_sem_t h_event_end_decoding; /* end of threading data */ #endif diff --git a/media/libvpx/libvpx/vp8/decoder/threading.c b/media/libvpx/libvpx/vp8/decoder/threading.c index 6ccb080cf9..d16284d134 100644 --- a/media/libvpx/libvpx/vp8/decoder/threading.c +++ b/media/libvpx/libvpx/vp8/decoder/threading.c @@ -15,6 +15,7 @@ #endif #include "onyxd_int.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_util/vpx_pthread.h" #include "vp8/common/common.h" #include "vp8/common/threading.h" #include "vp8/common/loopfilter.h" @@ -577,10 +578,10 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, /* signal end of decoding of current thread for current frame */ if (last_mb_row + (int)pbi->decoding_thread_count + 1 >= pc->mb_rows) - sem_post(&pbi->h_event_end_decoding); + vp8_sem_post(&pbi->h_event_end_decoding); } -static THREAD_FUNCTION thread_decoding_proc(void *p_data) { +static THREADFN thread_decoding_proc(void *p_data) { int ithread = ((DECODETHREAD_DATA *)p_data)->ithread; VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1); MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2); @@ -589,7 +590,7 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) { while (1) { if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break; - if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) { + if (vp8_sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) { if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) { break; } else { @@ -598,16 +599,17 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) { if (setjmp(xd->error_info.jmp)) { xd->error_info.setjmp = 0; // Signal the end of decoding 
for current thread. - sem_post(&pbi->h_event_end_decoding); + vp8_sem_post(&pbi->h_event_end_decoding); continue; } xd->error_info.setjmp = 1; mt_decode_mb_rows(pbi, xd, ithread + 1); + xd->error_info.setjmp = 0; } } } - return 0; + return THREAD_EXIT_SUCCESS; } void vp8_decoder_create_threads(VP8D_COMP *pbi) { @@ -634,13 +636,13 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32); CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count); - if (sem_init(&pbi->h_event_end_decoding, 0, 0)) { + if (vp8_sem_init(&pbi->h_event_end_decoding, 0, 0)) { vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, "Failed to initialize semaphore"); } for (ithread = 0; ithread < pbi->decoding_thread_count; ++ithread) { - if (sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break; + if (vp8_sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break; vp8_setup_block_dptrs(&pbi->mb_row_di[ithread].mbd); @@ -650,7 +652,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { if (pthread_create(&pbi->h_decoding_thread[ithread], 0, thread_decoding_proc, &pbi->de_thread_data[ithread])) { - sem_destroy(&pbi->h_event_start_decoding[ithread]); + vp8_sem_destroy(&pbi->h_event_start_decoding[ithread]); break; } } @@ -661,7 +663,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { /* the remainder of cleanup cases will be handled in * vp8_decoder_remove_threads(). 
*/ if (pbi->allocated_decoding_thread_count == 0) { - sem_destroy(&pbi->h_event_end_decoding); + vp8_sem_destroy(&pbi->h_event_end_decoding); } vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, "Failed to create threads"); @@ -812,16 +814,16 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) { /* allow all threads to exit */ for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) { - sem_post(&pbi->h_event_start_decoding[i]); + vp8_sem_post(&pbi->h_event_start_decoding[i]); pthread_join(pbi->h_decoding_thread[i], NULL); } for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) { - sem_destroy(&pbi->h_event_start_decoding[i]); + vp8_sem_destroy(&pbi->h_event_start_decoding[i]); } if (pbi->allocated_decoding_thread_count) { - sem_destroy(&pbi->h_event_end_decoding); + vp8_sem_destroy(&pbi->h_event_end_decoding); } vpx_free(pbi->h_decoding_thread); @@ -883,7 +885,7 @@ int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { pbi->decoding_thread_count); for (i = 0; i < pbi->decoding_thread_count; ++i) { - sem_post(&pbi->h_event_start_decoding[i]); + vp8_sem_post(&pbi->h_event_start_decoding[i]); } if (setjmp(xd->error_info.jmp)) { @@ -893,15 +895,16 @@ int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { // the current frame while the main thread starts decoding the next frame, // which causes a data race. 
for (i = 0; i < pbi->decoding_thread_count; ++i) - sem_wait(&pbi->h_event_end_decoding); + vp8_sem_wait(&pbi->h_event_end_decoding); return -1; } xd->error_info.setjmp = 1; mt_decode_mb_rows(pbi, xd, 0); + xd->error_info.setjmp = 0; for (i = 0; i < pbi->decoding_thread_count + 1; ++i) - sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */ + vp8_sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */ return 0; } diff --git a/media/libvpx/libvpx/vp8/encoder/encodeframe.c b/media/libvpx/libvpx/vp8/encoder/encodeframe.c index 82c48b13a7..d0117897db 100644 --- a/media/libvpx/libvpx/vp8/encoder/encodeframe.c +++ b/media/libvpx/libvpx/vp8/encoder/encodeframe.c @@ -7,38 +7,38 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include <stdio.h> #include <limits.h> +#include <stdio.h> #include "vpx_config.h" -#include "vp8_rtcd.h" -#include "./vpx_dsp_rtcd.h" -#include "bitstream.h" -#include "encodemb.h" -#include "encodemv.h" -#if CONFIG_MULTITHREAD -#include "ethreading.h" -#endif + #include "vp8/common/common.h" -#include "onyx_int.h" -#include "vp8/common/extend.h" #include "vp8/common/entropymode.h" -#include "vp8/common/quant_common.h" -#include "segmentation.h" -#include "vp8/common/setupintrarecon.h" -#include "encodeintra.h" -#include "vp8/common/reconinter.h" -#include "rdopt.h" -#include "pickinter.h" +#include "vp8/common/extend.h" #include "vp8/common/findnearmv.h" #include "vp8/common/invtrans.h" +#include "vp8/common/quant_common.h" +#include "vp8/common/reconinter.h" +#include "vp8/common/setupintrarecon.h" +#include "vp8/common/threading.h" +#include "vp8/encoder/bitstream.h" +#include "vp8/encoder/encodeframe.h" +#include "vp8/encoder/encodeintra.h" +#include "vp8/encoder/encodemb.h" +#include "vp8/encoder/encodemv.h" +#include "vp8/encoder/onyx_int.h" +#include "vp8/encoder/pickinter.h" +#include "vp8/encoder/rdopt.h" +#include 
"vp8/encoder/segmentation.h" +#include "vp8_rtcd.h" #include "vpx/internal/vpx_codec_internal.h" +#include "vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/vpx_timer.h" -#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING -#include "bitstream.h" + +#if CONFIG_MULTITHREAD +#include "vp8/encoder/ethreading.h" #endif -#include "encodeframe.h" extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x); @@ -773,7 +773,7 @@ void vp8_encode_frame(VP8_COMP *cpi) { vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1); for (i = 0; i < cpi->encoding_thread_count; ++i) { - sem_post(&cpi->h_event_start_encoding[i]); + vp8_sem_post(&cpi->h_event_start_encoding[i]); } for (mb_row = 0; mb_row < cm->mb_rows; @@ -806,7 +806,7 @@ void vp8_encode_frame(VP8_COMP *cpi) { } /* Wait for all the threads to finish. */ for (i = 0; i < cpi->encoding_thread_count; ++i) { - sem_wait(&cpi->h_event_end_encoding[i]); + vp8_sem_wait(&cpi->h_event_end_encoding[i]); } for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { diff --git a/media/libvpx/libvpx/vp8/encoder/ethreading.c b/media/libvpx/libvpx/vp8/encoder/ethreading.c index e2f8b89d46..98c87d3cbc 100644 --- a/media/libvpx/libvpx/vp8/encoder/ethreading.c +++ b/media/libvpx/libvpx/vp8/encoder/ethreading.c @@ -10,6 +10,7 @@ #include <stddef.h> #include "onyx_int.h" +#include "vpx_util/vpx_pthread.h" #include "vp8/common/threading.h" #include "vp8/common/common.h" #include "vp8/common/extend.h" @@ -22,27 +23,27 @@ extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip); -static THREAD_FUNCTION thread_loopfilter(void *p_data) { +static THREADFN thread_loopfilter(void *p_data) { VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1); VP8_COMMON *cm = &cpi->common; while (1) { if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; - if (sem_wait(&cpi->h_event_start_lpf) == 0) { + if 
(vp8_sem_wait(&cpi->h_event_start_lpf) == 0) { /* we're shutting down */ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; vp8_loopfilter_frame(cpi, cm); - sem_post(&cpi->h_event_end_lpf); + vp8_sem_post(&cpi->h_event_end_lpf); } } - return 0; + return THREAD_EXIT_SUCCESS; } -static THREAD_FUNCTION thread_encoding_proc(void *p_data) { +static THREADFN thread_encoding_proc(void *p_data) { int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread; VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1); MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2); @@ -51,7 +52,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { while (1) { if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; - if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) { + if (vp8_sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) { const int nsync = cpi->mt_sync_range; VP8_COMMON *cm = &cpi->common; int mb_row; @@ -307,12 +308,12 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { x->gf_active_ptr += cm->mb_cols * cpi->encoding_thread_count; } /* Signal that this thread has completed processing its rows. 
*/ - sem_post(&cpi->h_event_end_encoding[ithread]); + vp8_sem_post(&cpi->h_event_end_encoding[ithread]); } } /* printf("exit thread %d\n", ithread); */ - return 0; + return THREAD_EXIT_SUCCESS; } static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) { @@ -514,9 +515,9 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * th_count)); CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_start_encoding, - vpx_malloc(sizeof(sem_t) * th_count)); + vpx_malloc(sizeof(vp8_sem_t) * th_count)); CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_end_encoding, - vpx_malloc(sizeof(sem_t) * th_count)); + vpx_malloc(sizeof(vp8_sem_t) * th_count)); CHECK_MEM_ERROR(&cpi->common.error, cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count)); memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count); @@ -538,8 +539,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { vp8_setup_block_ptrs(&cpi->mb_row_ei[ithread].mb); vp8_setup_block_dptrs(&cpi->mb_row_ei[ithread].mb.e_mbd); - sem_init(&cpi->h_event_start_encoding[ithread], 0, 0); - sem_init(&cpi->h_event_end_encoding[ithread], 0, 0); + vp8_sem_init(&cpi->h_event_start_encoding[ithread], 0, 0); + vp8_sem_init(&cpi->h_event_end_encoding[ithread], 0, 0); ethd->ithread = ithread; ethd->ptr1 = (void *)cpi; @@ -554,11 +555,11 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { /* shutdown other threads */ vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { - sem_post(&cpi->h_event_start_encoding[ithread]); - sem_post(&cpi->h_event_end_encoding[ithread]); + vp8_sem_post(&cpi->h_event_start_encoding[ithread]); + vp8_sem_post(&cpi->h_event_end_encoding[ithread]); pthread_join(cpi->h_encoding_thread[ithread], 0); - sem_destroy(&cpi->h_event_start_encoding[ithread]); - sem_destroy(&cpi->h_event_end_encoding[ithread]); + vp8_sem_destroy(&cpi->h_event_start_encoding[ithread]); + 
vp8_sem_destroy(&cpi->h_event_end_encoding[ithread]); } /* free thread related resources */ @@ -580,8 +581,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { { LPFTHREAD_DATA *lpfthd = &cpi->lpf_thread_data; - sem_init(&cpi->h_event_start_lpf, 0, 0); - sem_init(&cpi->h_event_end_lpf, 0, 0); + vp8_sem_init(&cpi->h_event_start_lpf, 0, 0); + vp8_sem_init(&cpi->h_event_end_lpf, 0, 0); lpfthd->ptr1 = (void *)cpi; rc = pthread_create(&cpi->h_filter_thread, 0, thread_loopfilter, lpfthd); @@ -590,14 +591,14 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { /* shutdown other threads */ vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { - sem_post(&cpi->h_event_start_encoding[ithread]); - sem_post(&cpi->h_event_end_encoding[ithread]); + vp8_sem_post(&cpi->h_event_start_encoding[ithread]); + vp8_sem_post(&cpi->h_event_end_encoding[ithread]); pthread_join(cpi->h_encoding_thread[ithread], 0); - sem_destroy(&cpi->h_event_start_encoding[ithread]); - sem_destroy(&cpi->h_event_end_encoding[ithread]); + vp8_sem_destroy(&cpi->h_event_start_encoding[ithread]); + vp8_sem_destroy(&cpi->h_event_end_encoding[ithread]); } - sem_destroy(&cpi->h_event_end_lpf); - sem_destroy(&cpi->h_event_start_lpf); + vp8_sem_destroy(&cpi->h_event_end_lpf); + vp8_sem_destroy(&cpi->h_event_start_lpf); /* free thread related resources */ vpx_free(cpi->h_event_start_encoding); @@ -627,21 +628,21 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { int i; for (i = 0; i < cpi->encoding_thread_count; ++i) { - sem_post(&cpi->h_event_start_encoding[i]); - sem_post(&cpi->h_event_end_encoding[i]); + vp8_sem_post(&cpi->h_event_start_encoding[i]); + vp8_sem_post(&cpi->h_event_end_encoding[i]); pthread_join(cpi->h_encoding_thread[i], 0); - sem_destroy(&cpi->h_event_start_encoding[i]); - sem_destroy(&cpi->h_event_end_encoding[i]); + vp8_sem_destroy(&cpi->h_event_start_encoding[i]); + vp8_sem_destroy(&cpi->h_event_end_encoding[i]); } - sem_post(&cpi->h_event_start_lpf); 
+ vp8_sem_post(&cpi->h_event_start_lpf); pthread_join(cpi->h_filter_thread, 0); } - sem_destroy(&cpi->h_event_end_lpf); - sem_destroy(&cpi->h_event_start_lpf); + vp8_sem_destroy(&cpi->h_event_end_lpf); + vp8_sem_destroy(&cpi->h_event_start_lpf); cpi->b_lpf_running = 0; /* free thread related resources */ diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_if.c b/media/libvpx/libvpx/vp8/encoder/onyx_if.c index 4e128e3c49..ad01c6fc86 100644 --- a/media/libvpx/libvpx/vp8/encoder/onyx_if.c +++ b/media/libvpx/libvpx/vp8/encoder/onyx_if.c @@ -63,7 +63,7 @@ extern int vp8_update_coef_context(VP8_COMP *cpi); #endif -extern unsigned int vp8_get_processor_freq(); +extern unsigned int vp8_get_processor_freq(void); int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); @@ -267,7 +267,11 @@ static int rescale(int val, int num, int denom) { int64_t llden = denom; int64_t llval = val; - return (int)(llval * llnum / llden); + int64_t result = (llval * llnum / llden); + if (result <= INT_MAX) + return (int)result; + else + return INT_MAX; } void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf, @@ -276,7 +280,10 @@ void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf, LAYER_CONTEXT *lc = &cpi->layer_context[layer]; lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer]; - lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000; + if (cpi->oxcf.target_bitrate[layer] > INT_MAX / 1000) + lc->target_bandwidth = INT_MAX; + else + lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000; lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level; lc->optimal_buffer_level_in_ms = oxcf->optimal_buffer_level; @@ -1381,7 +1388,10 @@ void vp8_update_layer_contexts(VP8_COMP *cpi) { LAYER_CONTEXT *lc = &cpi->layer_context[i]; lc->framerate = cpi->ref_framerate / oxcf->rate_decimator[i]; - lc->target_bandwidth = oxcf->target_bitrate[i] * 1000; + if (oxcf->target_bitrate[i] > INT_MAX / 1000) + 
lc->target_bandwidth = INT_MAX; + else + lc->target_bandwidth = oxcf->target_bitrate[i] * 1000; lc->starting_buffer_level = rescale( (int)oxcf->starting_buffer_level_in_ms, lc->target_bandwidth, 1000); @@ -1995,6 +2005,7 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) { #if CONFIG_MULTITHREAD if (vp8cx_create_encoder_threads(cpi)) { + cpi->common.error.setjmp = 0; vp8_remove_compressor(&cpi); return 0; } @@ -2048,8 +2059,6 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) { vp8_loop_filter_init(cm); - cpi->common.error.setjmp = 0; - #if CONFIG_MULTI_RES_ENCODING /* Calculate # of MBs in a row in lower-resolution level image. */ @@ -2076,6 +2085,8 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) { vp8_setup_block_ptrs(&cpi->mb); vp8_setup_block_dptrs(&cpi->mb.e_mbd); + cpi->common.error.setjmp = 0; + return cpi; } @@ -3172,7 +3183,8 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { #if CONFIG_MULTITHREAD if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { - sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */ + /* signal that we have set filter_level */ + vp8_sem_post(&cpi->h_event_end_lpf); } #endif @@ -4387,11 +4399,11 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #if CONFIG_MULTITHREAD if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { /* start loopfilter in separate thread */ - sem_post(&cpi->h_event_start_lpf); + vp8_sem_post(&cpi->h_event_start_lpf); cpi->b_lpf_running = 1; /* wait for the filter_level to be picked so that we can continue with * stream packing */ - sem_wait(&cpi->h_event_end_lpf); + vp8_sem_wait(&cpi->h_event_end_lpf); } else #endif { @@ -5120,6 +5132,14 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, vpx_usec_timer_mark(&cmptimer); cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); +#if CONFIG_MULTITHREAD + /* wait for the lpf thread done */ + if 
(vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) { + vp8_sem_wait(&cpi->h_event_end_lpf); + cpi->b_lpf_running = 0; + } +#endif + if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) { generate_psnr_packet(cpi); } @@ -5247,16 +5267,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, #endif #endif - cpi->common.error.setjmp = 0; - -#if CONFIG_MULTITHREAD - /* wait for the lpf thread done */ - if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) { - sem_wait(&cpi->h_event_end_lpf); - cpi->b_lpf_running = 0; - } -#endif - return 0; } diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_int.h b/media/libvpx/libvpx/vp8/encoder/onyx_int.h index 1451a27812..bb1518ed7f 100644 --- a/media/libvpx/libvpx/vp8/encoder/onyx_int.h +++ b/media/libvpx/libvpx/vp8/encoder/onyx_int.h @@ -20,6 +20,7 @@ #include "tokenize.h" #include "vp8/common/onyxc_int.h" #include "vpx_dsp/variance.h" +#include "vpx_util/vpx_pthread.h" #include "encodemb.h" #include "vp8/encoder/quantize.h" #include "vp8/common/entropy.h" @@ -540,10 +541,10 @@ typedef struct VP8_COMP { LPFTHREAD_DATA lpf_thread_data; /* events */ - sem_t *h_event_start_encoding; - sem_t *h_event_end_encoding; - sem_t h_event_start_lpf; - sem_t h_event_end_lpf; + vp8_sem_t *h_event_start_encoding; + vp8_sem_t *h_event_end_encoding; + vp8_sem_t h_event_start_lpf; + vp8_sem_t h_event_end_lpf; #endif TOKENLIST *tplist; diff --git a/media/libvpx/libvpx/vp8/encoder/ratectrl.c b/media/libvpx/libvpx/vp8/encoder/ratectrl.c index fcd4eb04eb..7ba7a308ab 100644 --- a/media/libvpx/libvpx/vp8/encoder/ratectrl.c +++ b/media/libvpx/libvpx/vp8/encoder/ratectrl.c @@ -791,8 +791,12 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { (int)((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) / one_percent_bits); } else if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level) { - percent_high = - (int)((100 * cpi->bits_off_target) / (cpi->total_byte_count * 8)); + if 
(cpi->total_byte_count > 0) { + percent_high = (int)((100 * cpi->bits_off_target) / + (cpi->total_byte_count * 8)); + } else { + percent_high = cpi->oxcf.over_shoot_pct; + } } if (percent_high > cpi->oxcf.over_shoot_pct) { @@ -1190,10 +1194,13 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { /* Calculate required scaling factor based on target frame size and * size of frame produced using previous Q */ - if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS)) { - /* Case where we would overflow int */ - target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) - << BPER_MB_NORMBITS; + if (target_bits_per_frame > (INT_MAX >> BPER_MB_NORMBITS)) { + int temp = target_bits_per_frame / cpi->common.MBs; + if (temp > (INT_MAX >> BPER_MB_NORMBITS)) { + target_bits_per_mb = INT_MAX; + } else { + target_bits_per_mb = temp << BPER_MB_NORMBITS; + } } else { target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs; @@ -1534,9 +1541,13 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { // undershoots significantly, and then we end up dropping every other // frame because the QP/rate_correction_factor may have been too low // before the drop and then takes too long to come up. 
- if (target_size >= (INT_MAX >> BPER_MB_NORMBITS)) { - target_bits_per_mb = (target_size / cpi->common.MBs) - << BPER_MB_NORMBITS; + if (target_size > (INT_MAX >> BPER_MB_NORMBITS)) { + int temp = target_size / cpi->common.MBs; + if (temp > (INT_MAX >> BPER_MB_NORMBITS)) { + target_bits_per_mb = INT_MAX; + } else { + target_bits_per_mb = temp << BPER_MB_NORMBITS; + } } else { target_bits_per_mb = (target_size << BPER_MB_NORMBITS) / cpi->common.MBs; diff --git a/media/libvpx/libvpx/vp8/encoder/tokenize.h b/media/libvpx/libvpx/vp8/encoder/tokenize.h index 47b5be17f1..5223aa2d86 100644 --- a/media/libvpx/libvpx/vp8/encoder/tokenize.h +++ b/media/libvpx/libvpx/vp8/encoder/tokenize.h @@ -18,8 +18,6 @@ extern "C" { #endif -void vp8_tokenize_initialize(); - typedef struct { short Token; short Extra; diff --git a/media/libvpx/libvpx/vp8/vp8_cx_iface.c b/media/libvpx/libvpx/vp8/vp8_cx_iface.c index 1f16cc53d3..2b238c1a97 100644 --- a/media/libvpx/libvpx/vp8/vp8_cx_iface.c +++ b/media/libvpx/libvpx/vp8/vp8_cx_iface.c @@ -8,6 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <limits.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + #include "./vpx_config.h" #include "./vp8_rtcd.h" #include "./vpx_dsp_rtcd.h" @@ -18,6 +23,7 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_ports/static_assert.h" #include "vpx_ports/system_state.h" +#include "vpx_util/vpx_thread.h" #include "vpx_util/vpx_timestamp.h" #if CONFIG_MULTITHREAD #include "vp8/encoder/ethreading.h" @@ -27,8 +33,6 @@ #include "vp8/encoder/firstpass.h" #include "vp8/common/onyx.h" #include "vp8/common/common.h" -#include <stdlib.h> -#include <string.h> struct vp8_extracfg { struct vpx_codec_pkt_list *pkt_list; @@ -148,7 +152,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, g_profile, 3); RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); - RANGE_CHECK_HI(cfg, g_threads, 64); + RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS); #if CONFIG_REALTIME_ONLY RANGE_CHECK_HI(cfg, g_lag_in_frames, 0); #elif CONFIG_MULTI_RES_ENCODING @@ -495,7 +499,10 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); vp8_change_config(ctx->cpi, &ctx->oxcf); #if CONFIG_MULTITHREAD - if (vp8cx_create_encoder_threads(ctx->cpi)) return VPX_CODEC_ERROR; + if (vp8cx_create_encoder_threads(ctx->cpi)) { + ctx->cpi->common.error.setjmp = 0; + return VPX_CODEC_ERROR; + } #endif ctx->cpi->common.error.setjmp = 0; return VPX_CODEC_OK; @@ -777,9 +784,9 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, return res; } -static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, - unsigned long duration, - vpx_enc_deadline_t deadline) { +static vpx_codec_err_t pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, + unsigned long duration, + vpx_enc_deadline_t deadline) { int new_qc; #if !(CONFIG_REALTIME_ONLY) @@ -788,13 +795,15 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, if (deadline) { /* 
Convert duration parameter from stream timebase to microseconds */ - uint64_t duration_us; - VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 && (TICKS_PER_SEC % 1000000) == 0); - duration_us = duration * (uint64_t)ctx->timestamp_ratio.num / - (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000)); + if (duration > UINT64_MAX / (uint64_t)ctx->timestamp_ratio.num) { + ERROR("duration is too big"); + } + uint64_t duration_us = + duration * (uint64_t)ctx->timestamp_ratio.num / + ((uint64_t)ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000)); /* If the deadline is more that the duration this frame is to be shown, * use good quality mode. Otherwise use realtime mode. @@ -820,6 +829,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, ctx->oxcf.Mode = new_qc; vp8_change_config(ctx->cpi, &ctx->oxcf); } + return VPX_CODEC_OK; } static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx, @@ -894,13 +904,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, if (!res) res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1); - if (!ctx->pts_offset_initialized) { - ctx->pts_offset = pts_val; - ctx->pts_offset_initialized = 1; - } - pts_val -= ctx->pts_offset; - - pick_quickcompress_mode(ctx, duration, deadline); + if (!res) res = pick_quickcompress_mode(ctx, duration, deadline); vpx_codec_pkt_list_init(&ctx->pkt_list); // If no flags are set in the encode call, then use the frame flags as @@ -924,7 +928,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, /* Initialize the encoder instance on the first frame*/ if (!res && ctx->cpi) { unsigned int lib_flags; - YV12_BUFFER_CONFIG sd; int64_t dst_time_stamp, dst_end_time_stamp; size_t size, cx_data_sz; unsigned char *cx_data; @@ -951,12 +954,44 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, /* Convert API flags to internal codec lib flags */ lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? 
FRAMEFLAGS_KEY : 0; - dst_time_stamp = - pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; - dst_end_time_stamp = (pts_val + (int64_t)duration) * - ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; - if (img != NULL) { + YV12_BUFFER_CONFIG sd; + + if (!ctx->pts_offset_initialized) { + ctx->pts_offset = pts_val; + ctx->pts_offset_initialized = 1; + } + if (pts_val < ctx->pts_offset) { + vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "pts is smaller than initial pts"); + } + pts_val -= ctx->pts_offset; + if (pts_val > INT64_MAX / ctx->timestamp_ratio.num) { + vpx_internal_error( + &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "conversion of relative pts to ticks would overflow"); + } + dst_time_stamp = + pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; +#if ULONG_MAX > INT64_MAX + if (duration > INT64_MAX) { + vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "duration is too big"); + } +#endif + if (pts_val > INT64_MAX - (int64_t)duration) { + vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "relative pts + duration is too big"); + } + vpx_codec_pts_t pts_end = pts_val + (int64_t)duration; + if (pts_end > INT64_MAX / ctx->timestamp_ratio.num) { + vpx_internal_error( + &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "conversion of relative pts + duration to ticks would overflow"); + } + dst_end_time_stamp = + pts_end * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; + res = image2yuvconfig(img, &sd); if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) { @@ -989,6 +1024,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, &dst_end_time_stamp, !img); if (comp_data_state == VPX_CODEC_CORRUPT_FRAME) { + ctx->cpi->common.error.setjmp = 0; return VPX_CODEC_CORRUPT_FRAME; } else if (comp_data_state == -1) { break; diff --git a/media/libvpx/libvpx/vp8/vp8_dx_iface.c b/media/libvpx/libvpx/vp8/vp8_dx_iface.c index e81deaf4ea..fa7d7be403 
100644 --- a/media/libvpx/libvpx/vp8/vp8_dx_iface.c +++ b/media/libvpx/libvpx/vp8/vp8_dx_iface.c @@ -488,7 +488,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, if (pc->fb_idx_ref_cnt[pc->new_fb_idx] > 0) { pc->fb_idx_ref_cnt[pc->new_fb_idx]--; } - pc->error.setjmp = 0; + pbi->common.error.setjmp = 0; #if CONFIG_MULTITHREAD if (pbi->restart_threads) { ctx->si.w = 0; diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc index 261c316fd1..312092f190 100644 --- a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc +++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc @@ -8,10 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "vp8/vp8_ratectrl_rtc.h" + #include <math.h> + #include <new> + #include "vp8/common/common.h" -#include "vp8/vp8_ratectrl_rtc.h" #include "vp8/encoder/onyx_int.h" #include "vp8/encoder/ratectrl.h" #include "vpx_ports/system_state.h" @@ -311,6 +314,14 @@ FrameDropDecision VP8RateControlRTC::ComputeQP( int VP8RateControlRTC::GetQP() const { return q_; } +UVDeltaQP VP8RateControlRTC::GetUVDeltaQP() const { + VP8_COMMON *cm = &cpi_->common; + UVDeltaQP uv_delta_q; + uv_delta_q.uvdc_delta_q = cm->uvdc_delta_q; + uv_delta_q.uvac_delta_q = cm->uvac_delta_q; + return uv_delta_q; +} + int VP8RateControlRTC::GetLoopfilterLevel() const { VP8_COMMON *cm = &cpi_->common; const double qp = q_; diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h index 59fb607526..b458b5ce65 100644 --- a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h +++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h @@ -21,7 +21,6 @@ struct VP8_COMP; namespace libvpx { struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig { - public: VP8RateControlRtcConfig() { memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate)); memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator)); @@ -42,6 +41,9 @@ class VP8RateControlRTC { bool UpdateRateControl(const 
VP8RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; + // GetUVDeltaQP() needs to be called after ComputeQP() to get the latest + // delta QP for UV. + UVDeltaQP GetUVDeltaQP() const; // GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter // level is calculated from frame qp. int GetLoopfilterLevel() const; @@ -53,10 +55,10 @@ class VP8RateControlRTC { void PostEncodeUpdate(uint64_t encoded_frame_size); private: - VP8RateControlRTC() {} + VP8RateControlRTC() = default; bool InitRateControl(const VP8RateControlRtcConfig &cfg); - struct VP8_COMP *cpi_; - int q_; + struct VP8_COMP *cpi_ = nullptr; + int q_ = -1; }; } // namespace libvpx diff --git a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h index 1cfc12f6fa..4c8fcf6989 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h +++ b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h @@ -13,7 +13,6 @@ #include "./vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" -#include "vpx_util/vpx_thread.h" #include "./vp9_rtcd.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_loopfilter.h" diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c index 37762ca15a..1a93b97e56 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c +++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c @@ -12,4 +12,4 @@ #include "./vp9_rtcd.h" #include "vpx_ports/vpx_once.h" -void vp9_rtcd() { once(setup_rtcd_internal); } +void vp9_rtcd(void) { once(setup_rtcd_internal); } diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl index 3ecbd5417f..af3ff0e980 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -129,7 +129,7 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") { add_proto qw/int64_t vp9_block_error/, "const 
tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; -specialize qw/vp9_block_error_fp neon avx2 sse2/; +specialize qw/vp9_block_error_fp neon sve avx2 sse2/; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/; @@ -138,12 +138,12 @@ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - specialize qw/vp9_block_error neon avx2 sse2/; + specialize qw/vp9_block_error neon sve avx2 sse2/; add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; specialize qw/vp9_highbd_block_error neon sse2/; } else { - specialize qw/vp9_block_error neon avx2 msa sse2/; + specialize qw/vp9_block_error neon sve avx2 msa sse2/; } # fdct functions diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c index 8df18af3b8..24adbcbff0 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c +++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_util/vpx_pthread.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_thread_common.h" #include "vp9/common/vp9_reconinter.h" diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h index 5df0117f12..96c705d0d5 100644 --- 
a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h +++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h @@ -12,6 +12,7 @@ #define VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ #include "./vpx_config.h" #include "vp9/common/vp9_loopfilter.h" +#include "vpx_util/vpx_pthread.h" #include "vpx_util/vpx_thread.h" #ifdef __cplusplus diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c index c5892156f4..4fe680cefc 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c @@ -22,6 +22,7 @@ #include "vpx_ports/mem.h" #include "vpx_ports/mem_ops.h" #include "vpx_scale/vpx_scale.h" +#include "vpx_util/vpx_pthread.h" #include "vpx_util/vpx_thread.h" #if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vpx_util/vpx_debug_util.h" @@ -2292,6 +2293,7 @@ static INLINE void init_mt(VP9Decoder *pbi) { ++pbi->num_tile_workers; winterface->init(worker); + worker->thread_name = "vpx tile worker"; if (n < num_threads - 1 && !winterface->reset(worker)) { do { winterface->end(&pbi->tile_workers[pbi->num_tile_workers - 1]); diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c index 5a7e9f9ab3..5c77df5002 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c @@ -21,6 +21,7 @@ #include "vpx_ports/vpx_once.h" #include "vpx_ports/vpx_timer.h" #include "vpx_scale/vpx_scale.h" +#include "vpx_util/vpx_pthread.h" #include "vpx_util/vpx_thread.h" #include "vp9/common/vp9_alloccommon.h" @@ -210,6 +211,7 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) { cm->error.setjmp = 0; vpx_get_worker_interface()->init(&pbi->lf_worker); + pbi->lf_worker.thread_name = "vpx lf worker"; return pbi; } diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h index 2e198d552e..b3ee4eab5f 100644 --- 
a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h @@ -16,6 +16,7 @@ #include "vpx/vpx_codec.h" #include "vpx_dsp/bitreader.h" #include "vpx_scale/yv12config.h" +#include "vpx_util/vpx_pthread.h" #include "vpx_util/vpx_thread.h" #include "vp9/common/vp9_thread_common.h" diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c index 9a31f5a6d0..926ae87739 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c +++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c @@ -12,6 +12,7 @@ #include <string.h> #include "vpx/vpx_integer.h" +#include "vpx_util/vpx_pthread.h" #include "vp9/decoder/vp9_job_queue.h" diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h index bc23bf9c2c..59f71fb9ba 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h +++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h @@ -11,7 +11,7 @@ #ifndef VPX_VP9_DECODER_VP9_JOB_QUEUE_H_ #define VPX_VP9_DECODER_VP9_JOB_QUEUE_H_ -#include "vpx_util/vpx_thread.h" +#include "vpx_util/vpx_pthread.h" typedef struct { // Pointer to buffer base which contains the jobs diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c new file mode 100644 index 0000000000..78e7361d85 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" + +int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + int64x2_t err_v = vdupq_n_s64(0); + int64x2_t ssz_v = vdupq_n_s64(0); + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const int16x8_t diff0 = vabdq_s16(c0, d0); + const int16x8_t diff1 = vabdq_s16(c1, d1); + + err_v = vpx_dotq_s16(err_v, diff0, diff0); + err_v = vpx_dotq_s16(err_v, diff1, diff1); + + ssz_v = vpx_dotq_s16(ssz_v, c0, c0); + ssz_v = vpx_dotq_s16(ssz_v, c1, c1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + *ssz = horizontal_add_int64x2(ssz_v); + return horizontal_add_int64x2(err_v); +} + +int64_t vp9_block_error_fp_sve(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + int64x2_t err = vdupq_n_s64(0); + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const int16x8_t diff0 = vabdq_s16(c0, d0); + const int16x8_t diff1 = vabdq_s16(c1, d1); + + err = vpx_dotq_s16(err, diff0, diff0); + err = vpx_dotq_s16(err, diff1, diff1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + return horizontal_add_int64x2(err); +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_block.h b/media/libvpx/libvpx/vp9/encoder/vp9_block.h index 7fa00cd194..6542794667 100644 
--- a/media/libvpx/libvpx/vp9/encoder/vp9_block.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_block.h @@ -11,8 +11,6 @@ #ifndef VPX_VP9_ENCODER_VP9_BLOCK_H_ #define VPX_VP9_ENCODER_VP9_BLOCK_H_ -#include "vpx_util/vpx_thread.h" - #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c index 42073f756c..ee0fcd8729 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c @@ -119,8 +119,8 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) { PC_TREE *const tree = &td->pc_tree[pc_tree_index]; tree->block_size = square[0]; alloc_tree_contexts(cm, tree, 4); - tree->leaf_split[0] = this_leaf++; - for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0]; + tree->u.leaf_split[0] = this_leaf++; + for (j = 1; j < 4; j++) tree->u.leaf_split[j] = tree->u.leaf_split[0]; } // Each node has 4 leaf nodes, fill each block_size level of the tree @@ -130,7 +130,7 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) { PC_TREE *const tree = &td->pc_tree[pc_tree_index]; alloc_tree_contexts(cm, tree, 4 << (2 * square_index)); tree->block_size = square[square_index]; - for (j = 0; j < 4; j++) tree->split[j] = this_pc++; + for (j = 0; j < 4; j++) tree->u.split[j] = this_pc++; ++pc_tree_index; } ++square_index; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h index 4e301cc17d..51e13ba654 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h @@ -90,7 +90,7 @@ typedef struct PC_TREE { union { struct PC_TREE *split[4]; PICK_MODE_CONTEXT *leaf_split[4]; - }; + } u; // Obtained from a simple motion search. Used by the ML based partition search // speed feature. 
MV mv; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c index 46291f4868..b24c85f406 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c @@ -21,7 +21,7 @@ #include "vpx_ports/mem.h" #include "vpx_ports/vpx_timer.h" #include "vpx_ports/system_state.h" - +#include "vpx_util/vpx_pthread.h" #if CONFIG_MISMATCH_DEBUG #include "vpx_util/vpx_debug_util.h" #endif // CONFIG_MISMATCH_DEBUG @@ -2303,16 +2303,16 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile, assert(partition == PARTITION_SPLIT); if (bsize == BLOCK_8X8) { encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, - pc_tree->leaf_split[0]); + pc_tree->u.leaf_split[0]); } else { encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, - pc_tree->split[0]); + pc_tree->u.split[0]); encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled, - subsize, pc_tree->split[1]); + subsize, pc_tree->u.split[1]); encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled, - subsize, pc_tree->split[2]); + subsize, pc_tree->u.split[2]); encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, - subsize, pc_tree->split[3]); + subsize, pc_tree->u.split[3]); } break; } @@ -2645,13 +2645,13 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td, assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, - pc_tree->split[0]); + pc_tree->u.split[0]); encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled, - subsize, pc_tree->split[1]); + subsize, pc_tree->u.split[1]); encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled, - subsize, pc_tree->split[2]); + subsize, pc_tree->u.split[2]); encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, - output_enabled, subsize, 
pc_tree->split[3]); + output_enabled, subsize, pc_tree->u.split[3]); break; } @@ -2801,7 +2801,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, assert(partition == PARTITION_SPLIT); if (bsize == BLOCK_8X8) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, - subsize, pc_tree->leaf_split[0], INT_MAX, INT64_MAX); + subsize, pc_tree->u.leaf_split[0], INT_MAX, INT64_MAX); break; } last_part_rdc.rate = 0; @@ -2819,7 +2819,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, rd_use_partition(cpi, td, tile_data, mi_8x8 + jj * bss * mis + ii * bss, tp, mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate, &tmp_rdc.dist, i != 3, - pc_tree->split[i]); + pc_tree->u.split[i]); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { vp9_rd_cost_reset(&last_part_rdc); break; @@ -2860,9 +2860,9 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, continue; save_context(x, mi_row, mi_col, a, l, sa, sl, bsize); - pc_tree->split[i]->partitioning = PARTITION_NONE; + pc_tree->u.split[i]->partitioning = PARTITION_NONE; rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, - &tmp_rdc, split_subsize, &pc_tree->split[i]->none, + &tmp_rdc, split_subsize, &pc_tree->u.split[i]->none, INT_MAX, INT64_MAX); restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); @@ -2877,7 +2877,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, if (i != 3) encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0, - split_subsize, pc_tree->split[i]); + split_subsize, pc_tree->u.split[i]); pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx, split_subsize); @@ -3391,7 +3391,7 @@ static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x, features[feature_index++] = VPXMIN(rd_ratio, 2.0f); for (i = 0; i < 4; ++i) { - const int64_t this_rd = pc_tree->split[i]->none.rdcost; + const int64_t this_rd = pc_tree->u.split[i]->none.rdcost; const int rd_valid = this_rd > 0 && this_rd < 
1000000000; // Ratio between sub-block RD and whole block RD. features[feature_index++] = @@ -3958,19 +3958,19 @@ static void store_superblock_info( } // recursively traverse partition tree when partition is split. assert(pc_tree->partitioning == PARTITION_SPLIT); - store_superblock_info(pc_tree->split[0], mi_grid_visible, mi_stride, + store_superblock_info(pc_tree->u.split[0], mi_grid_visible, mi_stride, subblock_square_size_4x4, num_unit_rows, num_unit_cols, row_start_4x4, col_start_4x4, partition_info, motion_vector_info); - store_superblock_info(pc_tree->split[1], mi_grid_visible, mi_stride, + store_superblock_info(pc_tree->u.split[1], mi_grid_visible, mi_stride, subblock_square_size_4x4, num_unit_rows, num_unit_cols, row_start_4x4, col_start_4x4 + subblock_square_size_4x4, partition_info, motion_vector_info); - store_superblock_info(pc_tree->split[2], mi_grid_visible, mi_stride, + store_superblock_info(pc_tree->u.split[2], mi_grid_visible, mi_stride, subblock_square_size_4x4, num_unit_rows, num_unit_cols, row_start_4x4 + subblock_square_size_4x4, col_start_4x4, partition_info, motion_vector_info); - store_superblock_info(pc_tree->split[3], mi_grid_visible, mi_stride, + store_superblock_info(pc_tree->u.split[3], mi_grid_visible, mi_stride, subblock_square_size_4x4, num_unit_rows, num_unit_cols, row_start_4x4 + subblock_square_size_4x4, col_start_4x4 + subblock_square_size_4x4, @@ -4114,7 +4114,7 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, vp9_zero(pc_tree->mv); } if (bsize > BLOCK_8X8) { // Store MV result as reference for subblocks. - for (i = 0; i < 4; ++i) pc_tree->split[i]->mv = pc_tree->mv; + for (i = 0; i < 4; ++i) pc_tree->u.split[i]->mv = pc_tree->mv; } } @@ -4199,25 +4199,25 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_SPLIT // TODO(jingning): use the motion vectors given by the above search as // the starting point of motion search in the following partition type check. 
- pc_tree->split[0]->none.rdcost = 0; - pc_tree->split[1]->none.rdcost = 0; - pc_tree->split[2]->none.rdcost = 0; - pc_tree->split[3]->none.rdcost = 0; + pc_tree->u.split[0]->none.rdcost = 0; + pc_tree->u.split[1]->none.rdcost = 0; + pc_tree->u.split[2]->none.rdcost = 0; + pc_tree->u.split[3]->none.rdcost = 0; if (do_split || must_split) { subsize = get_subsize(bsize, PARTITION_SPLIT); load_pred_mv(x, ctx); if (bsize == BLOCK_8X8) { i = 4; if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed) - pc_tree->leaf_split[0]->pred_interp_filter = pred_interp_filter; + pc_tree->u.leaf_split[0]->pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, - pc_tree->leaf_split[0], best_rdc.rate, best_rdc.dist); + pc_tree->u.leaf_split[0], best_rdc.rate, best_rdc.dist); if (sum_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; } else { if (cpi->sf.prune_ref_frame_for_rect_partitions) { - const int ref1 = pc_tree->leaf_split[0]->mic.ref_frame[0]; - const int ref2 = pc_tree->leaf_split[0]->mic.ref_frame[1]; + const int ref1 = pc_tree->u.leaf_split[0]->mic.ref_frame[0]; + const int ref2 = pc_tree->u.leaf_split[0]->mic.ref_frame[1]; for (i = 0; i < 4; ++i) { ref_frames_used[i] |= (1 << ref1); if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); @@ -4250,21 +4250,21 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) continue; - pc_tree->split[i]->index = i; + pc_tree->u.split[i]->index = i; if (cpi->sf.prune_ref_frame_for_rect_partitions) - pc_tree->split[i]->none.rate = INT_MAX; + pc_tree->u.split[i]->none.rate = INT_MAX; found_best_rd = rd_pick_partition( cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, - &this_rdc, best_rdc_split, pc_tree->split[i]); + &this_rdc, best_rdc_split, pc_tree->u.split[i]); if (found_best_rd == 0) { sum_rdc.rdcost = INT64_MAX; break; } else { if (cpi->sf.prune_ref_frame_for_rect_partitions && - 
pc_tree->split[i]->none.rate != INT_MAX) { - const int ref1 = pc_tree->split[i]->none.mic.ref_frame[0]; - const int ref2 = pc_tree->split[i]->none.mic.ref_frame[1]; + pc_tree->u.split[i]->none.rate != INT_MAX) { + const int ref1 = pc_tree->u.split[i]->none.mic.ref_frame[0]; + const int ref2 = pc_tree->u.split[i]->none.mic.ref_frame[1]; ref_frames_used[i] |= (1 << ref1); if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); } @@ -4821,13 +4821,13 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, int mi_row, } break; case PARTITION_SPLIT: { - fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->split[0]); + fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->u.split[0]); fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize, - pc_tree->split[1]); + pc_tree->u.split[1]); fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize, - pc_tree->split[2]); + pc_tree->u.split[2]); fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize, - pc_tree->split[3]); + pc_tree->u.split[3]); break; } default: break; @@ -4845,7 +4845,8 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) { if (bsize > BLOCK_8X8) { BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); int i; - for (i = 0; i < 4; ++i) pred_pixel_ready_reset(pc_tree->split[i], subsize); + for (i = 0; i < 4; ++i) + pred_pixel_ready_reset(pc_tree->u.split[i], subsize); } } @@ -5046,9 +5047,9 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) continue; load_pred_mv(x, ctx); - nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, - mi_col + x_idx, subsize, &this_rdc, 0, - best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]); + nonrd_pick_partition( + cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, + &this_rdc, 0, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->u.split[i]); if (this_rdc.rate == INT_MAX) { vp9_rd_cost_reset(&sum_rdc); @@ -5281,10 +5282,10 @@ static void 
nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, subsize = get_subsize(bsize, PARTITION_SPLIT); nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize, output_enabled, rd_cost, - pc_tree->split[0]); + pc_tree->u.split[0]); nonrd_select_partition(cpi, td, tile_data, mi + hbs, tp, mi_row, mi_col + hbs, subsize, output_enabled, &this_rdc, - pc_tree->split[1]); + pc_tree->u.split[1]); if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { rd_cost->rate += this_rdc.rate; @@ -5292,7 +5293,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, } nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis, tp, mi_row + hbs, mi_col, subsize, output_enabled, - &this_rdc, pc_tree->split[2]); + &this_rdc, pc_tree->u.split[2]); if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { rd_cost->rate += this_rdc.rate; @@ -5300,7 +5301,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, } nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp, mi_row + hbs, mi_col + hbs, subsize, - output_enabled, &this_rdc, pc_tree->split[3]); + output_enabled, &this_rdc, pc_tree->u.split[3]); if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { rd_cost->rate += this_rdc.rate; @@ -5400,21 +5401,21 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td, subsize = get_subsize(bsize, PARTITION_SPLIT); if (bsize == BLOCK_8X8) { nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost, - subsize, pc_tree->leaf_split[0]); + subsize, pc_tree->u.leaf_split[0]); encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, - subsize, pc_tree->leaf_split[0]); + subsize, pc_tree->u.leaf_split[0]); } else { nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize, - output_enabled, dummy_cost, 
pc_tree->split[0]); + output_enabled, dummy_cost, pc_tree->u.split[0]); nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp, mi_row, mi_col + hbs, subsize, output_enabled, dummy_cost, - pc_tree->split[1]); + pc_tree->u.split[1]); nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp, mi_row + hbs, mi_col, subsize, output_enabled, - dummy_cost, pc_tree->split[2]); + dummy_cost, pc_tree->u.split[2]); nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp, mi_row + hbs, mi_col + hbs, subsize, output_enabled, - dummy_cost, pc_tree->split[3]); + dummy_cost, pc_tree->u.split[3]); } break; } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c index fd213f1e6b..3b8b5345f1 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c @@ -31,12 +31,14 @@ #include "vpx_ports/system_state.h" #include "vpx_ports/vpx_once.h" #include "vpx_ports/vpx_timer.h" +#include "vpx_util/vpx_pthread.h" #if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vpx_util/vpx_debug_util.h" #endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_enums.h" #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_idct.h" #if CONFIG_VP9_POSTPROC @@ -2135,24 +2137,22 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cpi->external_resize = 1; } - if (cpi->initial_width) { - int new_mi_size = 0; - vp9_set_mb_mi(cm, cm->width, cm->height); - new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); - if (cm->mi_alloc_size < new_mi_size) { - vp9_free_context_buffers(cm); - vp9_free_pc_tree(&cpi->td); - vpx_free(cpi->mbmi_ext_base); - alloc_compressor_data(cpi); - realloc_segmentation_maps(cpi); - cpi->initial_width = cpi->initial_height = 0; - cpi->external_resize = 0; - } else if (cm->mi_alloc_size == new_mi_size && - (cpi->oxcf.width > last_w || 
cpi->oxcf.height > last_h)) { - if (vp9_alloc_loop_filter(cm)) { - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate loop filter data"); - } + int new_mi_size = 0; + vp9_set_mb_mi(cm, cm->width, cm->height); + new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); + if (cm->mi_alloc_size < new_mi_size) { + vp9_free_context_buffers(cm); + vp9_free_pc_tree(&cpi->td); + vpx_free(cpi->mbmi_ext_base); + alloc_compressor_data(cpi); + realloc_segmentation_maps(cpi); + cpi->initial_width = cpi->initial_height = 0; + cpi->external_resize = 0; + } else if (cm->mi_alloc_size == new_mi_size && + (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) { + if (vp9_alloc_loop_filter(cm)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate loop filter data"); } } @@ -3472,7 +3472,6 @@ void vp9_scale_references(VP9_COMP *cpi) { continue; } -#if CONFIG_VP9_HIGHBITDEPTH if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { RefCntBuffer *new_fb_ptr = NULL; int force_scaling = 0; @@ -3485,6 +3484,7 @@ void vp9_scale_references(VP9_COMP *cpi) { new_fb_ptr = &pool->frame_bufs[new_fb]; if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || new_fb_ptr->buf.y_crop_height != cm->height) { +#if CONFIG_VP9_HIGHBITDEPTH if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, cm->use_highbitdepth, @@ -3494,22 +3494,7 @@ void vp9_scale_references(VP9_COMP *cpi) { "Failed to allocate frame buffer"); scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth, EIGHTTAP, 0); - cpi->scaled_ref_idx[ref_frame - 1] = new_fb; - alloc_frame_mvs(cm, new_fb); - } #else - if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { - RefCntBuffer *new_fb_ptr = NULL; - int force_scaling = 0; - int new_fb = cpi->scaled_ref_idx[ref_frame - 1]; - if (new_fb == INVALID_IDX) { - new_fb = get_free_fb(cm); - force_scaling = 1; - } - if (new_fb == INVALID_IDX) 
return; - new_fb_ptr = &pool->frame_bufs[new_fb]; - if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || - new_fb_ptr->buf.y_crop_height != cm->height) { if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, VP9_ENC_BORDER_IN_PIXELS, @@ -3517,10 +3502,10 @@ void vp9_scale_references(VP9_COMP *cpi) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, EIGHTTAP, 0); +#endif // CONFIG_VP9_HIGHBITDEPTH cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } -#endif // CONFIG_VP9_HIGHBITDEPTH } else { int buf_idx; RefCntBuffer *buf = NULL; @@ -3958,6 +3943,35 @@ static INLINE void set_raw_source_frame(VP9_COMP *cpi) { #endif } +static YV12_BUFFER_CONFIG *svc_twostage_scale( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, + int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) { + if (cm->mi_cols * MI_SIZE != unscaled->y_width || + cm->mi_rows * MI_SIZE != unscaled->y_height) { +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->bit_depth == VPX_BITS_8) { + vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, + phase_scaler2); + vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, + phase_scaler); + } else { + scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth, + filter_type2, phase_scaler2); + scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth, + filter_type, phase_scaler); + } +#else + vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, + phase_scaler2); + vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler); +#endif // CONFIG_VP9_HIGHBITDEPTH + return scaled; + } else { + return unscaled; + } +} + static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest) { VP9_COMMON *const cm = &cpi->common; @@ -4000,7 +4014,7 
@@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // result will be saved in scaled_temp and might be used later. const INTERP_FILTER filter_scaler2 = svc->downsample_filter_type[1]; const int phase_scaler2 = svc->downsample_filter_phase[1]; - cpi->Source = vp9_svc_twostage_scale( + cpi->Source = svc_twostage_scale( cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp, filter_scaler, phase_scaler, filter_scaler2, phase_scaler2); svc->scaled_one_half = 1; @@ -4486,21 +4500,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest // external rate control model. // This flag doesn't have any impact when external rate control is not used. int ext_rc_recode = 0; - // Maximal frame size allowed by the external rate control. - // case: 0, we ignore the max frame size limit, and encode with the qindex - // passed in by the external rate control model. - // If the external qindex is VPX_DEFAULT_Q, libvpx will pick a qindex - // and may recode if undershoot/overshoot is seen. - // If the external qindex is not VPX_DEFAULT_Q, we force no recode. - // case: -1, we take libvpx's decision for the max frame size, as well as - // the recode decision. - // Otherwise: if a specific size is given, libvpx's recode decision - // will respect the given size. - int ext_rc_max_frame_size = 0; - // Use VP9's decision of qindex. This flag is in use only in external rate - // control model to help determine whether to recode when - // |ext_rc_max_frame_size| is 0. 
- int ext_rc_use_default_q = 1; const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth; #if CONFIG_RATE_CTRL @@ -4616,27 +4615,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } #endif // CONFIG_RATE_CTRL if (cpi->ext_ratectrl.ready && !ext_rc_recode && + !cpi->tpl_with_external_rc && (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 && cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; vpx_rc_encodeframe_decision_t encode_frame_decision; - FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; - const int ref_frame_flags = get_ref_frame_flags(cpi); - RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; - const RefCntBuffer *curr_frame_buf = - get_ref_cnt_buffer(cm, cm->new_fb_idx); - // index 0 of a gf group is always KEY/OVERLAY/GOLDEN. - // index 1 refers to the first encoding frame in a gf group. - // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref. - // See function define_gf_group_structure(). - const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE; - get_ref_frame_bufs(cpi, ref_frame_bufs); codec_status = vp9_extrc_get_encodeframe_decision( - &cpi->ext_ratectrl, curr_frame_buf->frame_index, - cm->current_frame_coding_index, gf_group->index, update_type, - gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags, - &encode_frame_decision); + &cpi->ext_ratectrl, gf_group->index, &encode_frame_decision); if (codec_status != VPX_CODEC_OK) { vpx_internal_error(&cm->error, codec_status, "vp9_extrc_get_encodeframe_decision() failed"); @@ -4645,9 +4631,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest // libvpx's default q. 
if (encode_frame_decision.q_index != VPX_DEFAULT_Q) { q = encode_frame_decision.q_index; - ext_rc_use_default_q = 0; } - ext_rc_max_frame_size = encode_frame_decision.max_frame_size; } vp9_set_quantizer(cpi, q); @@ -4690,21 +4674,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest if (cpi->ext_ratectrl.ready && (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) { - // In general, for the external rate control, we take the qindex provided - // as input and encode the frame with this qindex faithfully. However, - // in some extreme scenarios, the provided qindex leads to a massive - // overshoot of frame size. In this case, we fall back to VP9's decision - // to pick a new qindex and recode the frame. We return the new qindex - // through the API to the external model. - if (ext_rc_max_frame_size == 0) { - if (!ext_rc_use_default_q) break; - } else if (ext_rc_max_frame_size == -1) { - // Do nothing, fall back to libvpx's recode decision. - } else { - // Change the max frame size, used in libvpx's recode decision. 
- rc->max_frame_bandwidth = ext_rc_max_frame_size; - } - ext_rc_recode = 1; + break; } #if CONFIG_RATE_CTRL if (cpi->oxcf.use_simple_encode_api) { @@ -4974,35 +4944,6 @@ static void set_ext_overrides(VP9_COMP *cpi) { } } -YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( - VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, - YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, - int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) { - if (cm->mi_cols * MI_SIZE != unscaled->y_width || - cm->mi_rows * MI_SIZE != unscaled->y_height) { -#if CONFIG_VP9_HIGHBITDEPTH - if (cm->bit_depth == VPX_BITS_8) { - vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, - phase_scaler2); - vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, - phase_scaler); - } else { - scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth, - filter_type2, phase_scaler2); - scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth, - filter_type, phase_scaler); - } -#else - vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, - phase_scaler2); - vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler); -#endif // CONFIG_VP9_HIGHBITDEPTH - return scaled; - } else { - return unscaled; - } -} - YV12_BUFFER_CONFIG *vp9_scale_if_required( VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler) { @@ -6429,7 +6370,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } if (arf_src_index) { - assert(arf_src_index <= rc->frames_to_key); + if (!(cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && + cpi->ext_ratectrl.funcs.get_gop_decision != NULL)) { + // This assert only makes sense when not using external RC. 
+ assert(arf_src_index <= rc->frames_to_key); + } if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) { cpi->alt_ref_source = source; @@ -6617,7 +6563,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE && cpi->sf.enable_tpl_model) { vp9_init_tpl_buffer(cpi); - vp9_estimate_qp_gop(cpi); + vp9_estimate_tpl_qp_gop(cpi); vp9_setup_tpl_stats(cpi); } #if CONFIG_COLLECT_COMPONENT_TIMING diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h index 91df538821..898855d10d 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h @@ -25,6 +25,7 @@ #include "vpx_dsp/variance.h" #include "vpx_dsp/psnr.h" #include "vpx_ports/system_state.h" +#include "vpx_util/vpx_pthread.h" #include "vpx_util/vpx_thread.h" #include "vpx_util/vpx_timestamp.h" @@ -1062,7 +1063,7 @@ typedef struct VP9_COMP { */ uint64_t frame_component_time[kTimingComponents]; #endif - // Flag to indicate if QP and GOP for TPL is controlled by external RC. + // Flag to indicate if QP and GOP for TPL are controlled by external RC. 
int tpl_with_external_rc; } VP9_COMP; @@ -1395,11 +1396,6 @@ void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst); #endif // CONFIG_VP9_HIGHBITDEPTH -YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( - VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, - YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, - int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2); - YV12_BUFFER_CONFIG *vp9_scale_if_required( VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler); diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c index a8d1cb7a7a..c3b79507e6 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c @@ -17,6 +17,7 @@ #include "vp9/encoder/vp9_multi_thread.h" #include "vp9/encoder/vp9_temporal_filter.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_util/vpx_pthread.h" static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { int i, j, k, l, m, n; @@ -55,7 +56,7 @@ static int enc_worker_hook(void *arg1, void *unused) { vp9_encode_tile(cpi, thread_data->td, tile_row, tile_col); } - return 0; + return 1; } static int get_max_tile_cols(VP9_COMP *cpi) { @@ -106,6 +107,7 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) { ++cpi->num_workers; winterface->init(worker); + worker->thread_name = "vpx enc worker"; if (i < num_workers - 1) { thread_data->cpi = cpi; @@ -204,8 +206,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { create_enc_workers(cpi, num_workers); for (i = 0; i < num_workers; i++) { - EncWorkerData *thread_data; - thread_data = &cpi->tile_thr_data[i]; + EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; // Before encoding a frame, copy the thread data from cpi. 
if (thread_data->td != &cpi->td) { @@ -456,7 +457,7 @@ static int first_pass_worker_hook(void *arg1, void *arg2) { this_tile, &best_ref_mv, mb_row); } } - return 0; + return 1; } void vp9_encode_fp_row_mt(VP9_COMP *cpi) { @@ -543,7 +544,7 @@ static int temporal_filter_worker_hook(void *arg1, void *arg2) { mb_col_start, mb_col_end); } } - return 0; + return 1; } void vp9_temporal_filter_row_mt(VP9_COMP *cpi) { @@ -616,7 +617,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *arg2) { vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row); } } - return 0; + return 1; } void vp9_encode_tiles_row_mt(VP9_COMP *cpi) { diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h index 4c192da515..359cdd1290 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h @@ -11,13 +11,14 @@ #ifndef VPX_VP9_ENCODER_VP9_ETHREAD_H_ #define VPX_VP9_ENCODER_VP9_ETHREAD_H_ +#include "vpx_util/vpx_pthread.h" + #ifdef __cplusplus extern "C" { #endif #define MAX_NUM_TILE_COLS (1 << 6) #define MAX_NUM_TILE_ROWS 4 -#define MAX_NUM_THREADS 80 struct VP9_COMP; struct ThreadData; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c index 4664e8c5e2..7b0d89acd2 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c @@ -156,32 +156,15 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { } vpx_codec_err_t vp9_extrc_get_encodeframe_decision( - EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, - FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, - RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + EXT_RATECTRL *ext_ratectrl, int gop_index, vpx_rc_encodeframe_decision_t *encode_frame_decision) { - if (ext_ratectrl == NULL) { - return VPX_CODEC_INVALID_PARAM; - } - if 
(ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0) { - vpx_rc_status_t rc_status; - vpx_rc_encodeframe_info_t encode_frame_info; - encode_frame_info.show_index = show_index; - encode_frame_info.coding_index = coding_index; - encode_frame_info.gop_index = gop_index; - encode_frame_info.frame_type = extrc_get_frame_type(update_type); - encode_frame_info.gop_size = gop_size; - encode_frame_info.use_alt_ref = use_alt_ref; - - vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, - encode_frame_info.ref_frame_coding_indexes, - encode_frame_info.ref_frame_valid_list); + assert(ext_ratectrl != NULL); + assert(ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0); - rc_status = ext_ratectrl->funcs.get_encodeframe_decision( - ext_ratectrl->model, &encode_frame_info, encode_frame_decision); - if (rc_status == VPX_RC_ERROR) { - return VPX_CODEC_ERROR; - } + vpx_rc_status_t rc_status = ext_ratectrl->funcs.get_encodeframe_decision( + ext_ratectrl->model, gop_index, encode_frame_decision); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; } return VPX_CODEC_OK; } @@ -222,29 +205,14 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( } vpx_codec_err_t vp9_extrc_get_gop_decision( - EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info, - vpx_rc_gop_decision_t *gop_decision) { + EXT_RATECTRL *ext_ratectrl, vpx_rc_gop_decision_t *gop_decision) { vpx_rc_status_t rc_status; if (ext_ratectrl == NULL || !ext_ratectrl->ready || (ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) { return VPX_CODEC_INVALID_PARAM; } - rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, - gop_info, gop_decision); - if (gop_decision->use_alt_ref) { - const int arf_constraint = - gop_decision->gop_coding_frames >= gop_info->min_gf_interval && - gop_decision->gop_coding_frames < gop_info->lag_in_frames; - if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR; - } - // TODO(chengchen): Take min and 
max gf interval from the model - // and overwrite libvpx's decision so that we can get rid - // of one of the checks here. - if (gop_decision->gop_coding_frames > gop_info->frames_to_key || - gop_decision->gop_coding_frames - gop_decision->use_alt_ref > - gop_info->max_gf_interval) { - return VPX_CODEC_ERROR; - } + rc_status = + ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, gop_decision); if (rc_status == VPX_RC_ERROR) { return VPX_CODEC_ERROR; } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h index b04580c1d4..d1be5f2aef 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h @@ -39,9 +39,7 @@ vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl, const VpxTplGopStats *tpl_gop_stats); vpx_codec_err_t vp9_extrc_get_encodeframe_decision( - EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, - FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, - RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + EXT_RATECTRL *ext_ratectrl, int gop_index, vpx_rc_encodeframe_decision_t *encode_frame_decision); vpx_codec_err_t vp9_extrc_update_encodeframe_result( @@ -50,9 +48,8 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, uint32_t input_bit_depth, const int actual_encoding_qindex); -vpx_codec_err_t vp9_extrc_get_gop_decision( - EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info, - vpx_rc_gop_decision_t *gop_decision); +vpx_codec_err_t vp9_extrc_get_gop_decision(EXT_RATECTRL *ext_ratectrl, + vpx_rc_gop_decision_t *gop_decision); vpx_codec_err_t vp9_extrc_get_frame_rdmult( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c index dcb62e8768..69261ac65f 100644 
--- a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c @@ -162,42 +162,3 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, dst->uv_stride, src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv, chroma_step); } - -void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int srcy, - int srcx, int srch, int srcw) { - // If the side is not touching the bounder then don't extend. - const int et_y = srcy ? 0 : dst->border; - const int el_y = srcx ? 0 : dst->border; - const int eb_y = srcy + srch != src->y_height - ? 0 - : dst->border + dst->y_height - src->y_height; - const int er_y = srcx + srcw != src->y_width - ? 0 - : dst->border + dst->y_width - src->y_width; - const int src_y_offset = srcy * src->y_stride + srcx; - const int dst_y_offset = srcy * dst->y_stride + srcx; - - const int et_uv = ROUND_POWER_OF_TWO(et_y, 1); - const int el_uv = ROUND_POWER_OF_TWO(el_y, 1); - const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1); - const int er_uv = ROUND_POWER_OF_TWO(er_y, 1); - const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); - const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); - const int srch_uv = ROUND_POWER_OF_TWO(srch, 1); - const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1); - // detect nv12 colorspace - const int chroma_step = src->v_buffer - src->u_buffer == 1 ? 
2 : 1; - - copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride, - dst->y_buffer + dst_y_offset, dst->y_stride, srcw, srch, - et_y, el_y, eb_y, er_y, 1); - - copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride, - dst->u_buffer + dst_uv_offset, dst->uv_stride, srcw_uv, - srch_uv, et_uv, el_uv, eb_uv, er_uv, chroma_step); - - copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride, - dst->v_buffer + dst_uv_offset, dst->uv_stride, srcw_uv, - srch_uv, et_uv, el_uv, eb_uv, er_uv, chroma_step); -} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h index 4ba7fc95e3..21d7e68b9f 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h @@ -21,9 +21,6 @@ extern "C" { void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst); -void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int srcy, - int srcx, int srch, int srcw); #ifdef __cplusplus } // extern "C" #endif diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c index a9cdf5353f..58b9b7ba61 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c @@ -37,6 +37,7 @@ #include "vp9/encoder/vp9_mcomp.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_rd.h" +#include "vpx/vpx_ext_ratectrl.h" #include "vpx_dsp/variance.h" #define OUTPUT_FPF 0 @@ -1164,7 +1165,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, v_fn_ptr.vf = get_block_variance_fn(bsize); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, 8); + v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd); } #endif // CONFIG_VP9_HIGHBITDEPTH this_motion_error = @@ -2769,38 +2770,6 @@ static void define_gf_group(VP9_COMP 
*cpi, int gf_start_show_idx) { } } #endif - // If the external rate control model for GOP is used, the gop decisions - // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref| - // will be overwritten. - if (cpi->ext_ratectrl.ready && - (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && - cpi->ext_ratectrl.funcs.get_gop_decision != NULL && !end_of_sequence) { - vpx_codec_err_t codec_status; - vpx_rc_gop_decision_t gop_decision; - vpx_rc_gop_info_t gop_info; - gop_info.min_gf_interval = rc->min_gf_interval; - gop_info.max_gf_interval = rc->max_gf_interval; - gop_info.active_min_gf_interval = active_gf_interval.min; - gop_info.active_max_gf_interval = active_gf_interval.max; - gop_info.allow_alt_ref = allow_alt_ref; - gop_info.is_key_frame = is_key_frame; - gop_info.last_gop_use_alt_ref = rc->source_alt_ref_active; - gop_info.frames_since_key = rc->frames_since_key; - gop_info.frames_to_key = rc->frames_to_key; - gop_info.lag_in_frames = cpi->oxcf.lag_in_frames; - gop_info.show_index = cm->current_video_frame; - gop_info.coding_index = cm->current_frame_coding_index; - gop_info.gop_global_index = rc->gop_global_index; - - codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info, - &gop_decision); - if (codec_status != VPX_CODEC_OK) { - vpx_internal_error(&cm->error, codec_status, - "vp9_extrc_get_gop_decision() failed"); - } - gop_coding_frames = gop_decision.gop_coding_frames; - use_alt_ref = gop_decision.use_alt_ref; - } // Was the group length constrained by the requirement for a new KF? rc->constrained_gf_group = (gop_coding_frames >= rc->frames_to_key) ? 1 : 0; @@ -3600,32 +3569,71 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { else twopass->fr_content_type = FC_NORMAL; - // Keyframe and section processing. - if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) { - // Define next KF group and assign bits to it. 
- find_next_key_frame(cpi, show_idx); + // If the external rate control model for GOP is used, the gop decisions + // are overwritten, including whether to use key frame in this GF group, + // GF group length, and whether to use arf. + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && + cpi->ext_ratectrl.funcs.get_gop_decision != NULL && + rc->frames_till_gf_update_due == 0) { + vpx_codec_err_t codec_status; + vpx_rc_gop_decision_t gop_decision; + codec_status = + vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_decision); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_gop_decision() failed"); + } + if (gop_decision.use_key_frame) { + cpi->common.frame_type = KEY_FRAME; + rc->frames_since_key = 0; + // Clear the alt ref active flag and last group multi arf flags as they + // can never be set for a key frame. + rc->source_alt_ref_active = 0; + // KF is always a GF so clear frames till next gf counter. + rc->frames_till_gf_update_due = 0; + } + + // A new GF group + if (rc->frames_till_gf_update_due == 0) { + vp9_zero(twopass->gf_group); + ++rc->gop_global_index; + if (gop_decision.use_alt_ref) { + rc->source_alt_ref_pending = 1; + } + rc->baseline_gf_interval = + gop_decision.gop_coding_frames - rc->source_alt_ref_pending; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + define_gf_group_structure(cpi); + } } else { - cm->frame_type = INTER_FRAME; - } + // Keyframe and section processing. + if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) { + // Define next KF group and assign bits to it. + find_next_key_frame(cpi, show_idx); + } else { + cm->frame_type = INTER_FRAME; + } - // Define a new GF/ARF group. (Should always enter here for key frames). - if (rc->frames_till_gf_update_due == 0) { - define_gf_group(cpi, show_idx); + // Define a new GF/ARF group. (Should always enter here for key frames). 
+ if (rc->frames_till_gf_update_due == 0) { + define_gf_group(cpi, show_idx); - rc->frames_till_gf_update_due = rc->baseline_gf_interval; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; #if ARF_STATS_OUTPUT - { - FILE *fpfile; - fpfile = fopen("arf.stt", "a"); - ++arf_count; - fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n", - cm->current_video_frame, rc->frames_till_gf_update_due, - rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type); - - fclose(fpfile); - } + { + FILE *fpfile; + fpfile = fopen("arf.stt", "a"); + ++arf_count; + fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n", + cm->current_video_frame, rc->frames_till_gf_update_due, + rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type); + + fclose(fpfile); + } #endif + } } vp9_configure_buffer_updates(cpi, gf_group->index); diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c index 97838c38e6..b6be4f88ac 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c @@ -9,6 +9,7 @@ */ #include <assert.h> #include <stdlib.h> +#include <string.h> #include "./vpx_config.h" @@ -81,7 +82,6 @@ bail: return NULL; } -#define USE_PARTIAL_COPY 0 int vp9_lookahead_full(const struct lookahead_ctx *ctx) { return ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz; } @@ -94,11 +94,6 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, int64_t ts_start, int64_t ts_end, int use_highbitdepth, vpx_enc_frame_flags_t flags) { struct lookahead_entry *buf; -#if USE_PARTIAL_COPY - int row, col, active_end; - int mb_rows = (src->y_height + 15) >> 4; - int mb_cols = (src->y_width + 15) >> 4; -#endif int width = src->y_crop_width; int height = src->y_crop_height; int uv_width = src->uv_crop_width; @@ -119,76 +114,36 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, height != buf->img.y_crop_height || uv_width != buf->img.uv_crop_width || uv_height != 
buf->img.uv_crop_height; - larger_dimensions = width > buf->img.y_width || height > buf->img.y_height || - uv_width > buf->img.uv_width || - uv_height > buf->img.uv_height; + larger_dimensions = + width > buf->img.y_crop_width || height > buf->img.y_crop_height || + uv_width > buf->img.uv_crop_width || uv_height > buf->img.uv_crop_height; assert(!larger_dimensions || new_dimensions); -#if USE_PARTIAL_COPY - // TODO(jkoleszar): This is disabled for now, as - // vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware. - - // Only do this partial copy if the following conditions are all met: - // 1. Lookahead queue has has size of 1. - // 2. Active map is provided. - // 3. This is not a key frame, golden nor altref frame. - if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) { - for (row = 0; row < mb_rows; ++row) { - col = 0; - - while (1) { - // Find the first active macroblock in this row. - for (; col < mb_cols; ++col) { - if (active_map[col]) break; - } - - // No more active macroblock in this row. - if (col == mb_cols) break; - - // Find the end of active region in this row. - active_end = col; - - for (; active_end < mb_cols; ++active_end) { - if (!active_map[active_end]) break; - } - - // Only copy this active region. - vp9_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4, - 16, (active_end - col) << 4); - - // Start again from the end of this active region. 
- col = active_end; - } - - active_map += mb_cols; - } - } else { -#endif - if (larger_dimensions) { - YV12_BUFFER_CONFIG new_img; - memset(&new_img, 0, sizeof(new_img)); - if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x, - subsampling_y, + if (larger_dimensions) { + YV12_BUFFER_CONFIG new_img; + memset(&new_img, 0, sizeof(new_img)); + if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x, + subsampling_y, #if CONFIG_VP9_HIGHBITDEPTH - use_highbitdepth, + use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, 0)) - return 1; - vpx_free_frame_buffer(&buf->img); - buf->img = new_img; - } else if (new_dimensions) { - buf->img.y_crop_width = src->y_crop_width; - buf->img.y_crop_height = src->y_crop_height; - buf->img.uv_crop_width = src->uv_crop_width; - buf->img.uv_crop_height = src->uv_crop_height; - buf->img.subsampling_x = src->subsampling_x; - buf->img.subsampling_y = src->subsampling_y; - } - // Partial copy not implemented yet - vp9_copy_and_extend_frame(src, &buf->img); -#if USE_PARTIAL_COPY + VP9_ENC_BORDER_IN_PIXELS, 0)) + return 1; + vpx_free_frame_buffer(&buf->img); + buf->img = new_img; + } else if (new_dimensions) { + buf->img.y_width = src->y_width; + buf->img.y_height = src->y_height; + buf->img.uv_width = src->uv_width; + buf->img.uv_height = src->uv_height; + buf->img.y_crop_width = src->y_crop_width; + buf->img.y_crop_height = src->y_crop_height; + buf->img.uv_crop_width = src->uv_crop_width; + buf->img.uv_crop_height = src->uv_crop_height; + buf->img.subsampling_x = src->subsampling_x; + buf->img.subsampling_y = src->subsampling_y; } -#endif + vp9_copy_and_extend_frame(src, &buf->img); buf->ts_start = ts_start; buf->ts_end = ts_end; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c index 0843cd97e4..6e124f9944 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c @@ -10,6 +10,7 @@ #include 
<assert.h> +#include "vpx_util/vpx_pthread.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_multi_thread.h" diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c index 3f4fe6957b..d37e020b0a 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c @@ -12,6 +12,7 @@ #include <math.h> #include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/bitops.h" #include "vpx_ports/mem.h" #include "vp9/common/vp9_quant_common.h" diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c index 62d6b93028..76d5435e60 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c @@ -35,6 +35,7 @@ #include "vp9/encoder/vp9_ext_ratectrl.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_svc_layercontext.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_ext_ratectrl.h" @@ -1433,8 +1434,8 @@ static int rc_constant_q(const VP9_COMP *cpi, int *bottom_index, int *top_index, return q; } -static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, - int *top_index, int gf_group_index) { +int vp9_rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, + int *top_index, int gf_group_index) { const VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -1581,7 +1582,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, q = active_worst_quality; } } - clamp(q, active_best_quality, active_worst_quality); *top_index = active_worst_quality; *bottom_index = active_best_quality; @@ -1603,8 +1603,8 @@ int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index, else q = rc_pick_q_and_bounds_one_pass_vbr(cpi, 
bottom_index, top_index); } else { - q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index, - gf_group_index); + q = vp9_rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index, + gf_group_index); } if (cpi->sf.use_nonrd_pick_mode) { if (cpi->sf.force_frame_boost == 1) q -= cpi->sf.max_delta_qindex; @@ -1675,63 +1675,6 @@ void vp9_configure_buffer_updates(VP9_COMP *cpi, int gf_group_index) { } } -void vp9_estimate_qp_gop(VP9_COMP *cpi) { - int gop_length = cpi->twopass.gf_group.gf_group_size; - int bottom_index, top_index; - int idx; - const int gf_index = cpi->twopass.gf_group.index; - const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref; - const int refresh_frame_context = cpi->common.refresh_frame_context; - - for (idx = 1; idx <= gop_length; ++idx) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[idx]; - int target_rate = cpi->twopass.gf_group.bit_allocation[idx]; - cpi->twopass.gf_group.index = idx; - vp9_rc_set_frame_target(cpi, target_rate); - vp9_configure_buffer_updates(cpi, idx); - if (cpi->tpl_with_external_rc) { - if (cpi->ext_ratectrl.ready && - (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 && - cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) { - VP9_COMMON *cm = &cpi->common; - vpx_codec_err_t codec_status; - const GF_GROUP *gf_group = &cpi->twopass.gf_group; - vpx_rc_encodeframe_decision_t encode_frame_decision; - FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; - RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; - const RefCntBuffer *curr_frame_buf = - get_ref_cnt_buffer(cm, cm->new_fb_idx); - // index 0 of a gf group is always KEY/OVERLAY/GOLDEN. - // index 1 refers to the first encoding frame in a gf group. - // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref. - // See function define_gf_group_structure(). 
- const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE; - const int frame_coding_index = cm->current_frame_coding_index + idx - 1; - get_ref_frame_bufs(cpi, ref_frame_bufs); - codec_status = vp9_extrc_get_encodeframe_decision( - &cpi->ext_ratectrl, curr_frame_buf->frame_index, frame_coding_index, - gf_group->index, update_type, gf_group->gf_group_size, use_alt_ref, - ref_frame_bufs, 0 /*ref_frame_flags is not used*/, - &encode_frame_decision); - if (codec_status != VPX_CODEC_OK) { - vpx_internal_error(&cm->error, codec_status, - "vp9_extrc_get_encodeframe_decision() failed"); - } - tpl_frame->base_qindex = encode_frame_decision.q_index; - } - } else { - tpl_frame->base_qindex = - rc_pick_q_and_bounds_two_pass(cpi, &bottom_index, &top_index, idx); - tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1); - } - } - // Reset the actual index and frame update - cpi->twopass.gf_group.index = gf_index; - cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref; - cpi->common.refresh_frame_context = refresh_frame_context; - vp9_configure_buffer_updates(cpi, gf_index); -} - void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int frame_target, int *frame_under_shoot_limit, int *frame_over_shoot_limit) { @@ -3361,14 +3304,20 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { cpi->rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor; } // For temporal layers, reset the rate control parametes across all - // temporal layers. If the first_spatial_layer_to_encode > 0, then this - // superframe has skipped lower base layers. So in this case we should also - // reset and force max-q for spatial layers < first_spatial_layer_to_encode. + // temporal layers. + // If the first_spatial_layer_to_encode > 0, then this superframe has + // skipped lower base layers. So in this case we should also reset and + // force max-q for spatial layers < first_spatial_layer_to_encode. 
+ // For the case of no inter-layer prediction on delta frames: reset and + // force max-q for all spatial layers, to avoid excessive frame drops. if (cpi->use_svc) { int tl = 0; int sl = 0; SVC *svc = &cpi->svc; - for (sl = 0; sl < VPXMAX(1, svc->first_spatial_layer_to_encode); ++sl) { + int num_spatial_layers = VPXMAX(1, svc->first_spatial_layer_to_encode); + if (svc->disable_inter_layer_pred != INTER_LAYER_PRED_ON) + num_spatial_layers = svc->number_spatial_layers; + for (sl = 0; sl < num_spatial_layers; ++sl) { for (tl = 0; tl < svc->number_temporal_layers; ++tl) { const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h index 48c49e937e..0c61ad3461 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h @@ -346,12 +346,14 @@ int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q); void vp9_configure_buffer_updates(struct VP9_COMP *cpi, int gf_group_index); -void vp9_estimate_qp_gop(struct VP9_COMP *cpi); - void vp9_compute_frame_low_motion(struct VP9_COMP *const cpi); void vp9_update_buffer_level_svc_preencode(struct VP9_COMP *cpi); +int vp9_rc_pick_q_and_bounds_two_pass(const struct VP9_COMP *cpi, + int *bottom_index, int *top_index, + int gf_group_index); + #ifdef __cplusplus } // extern "C" #endif diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c index 974e43c90f..447136ed84 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c @@ -1834,7 +1834,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi, return 1; } -static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) { +static INLINE int skip_iters(int_mv iter_mvs[][2], int ite, int id) { if (ite >= 2 && iter_mvs[ite - 2][!id].as_int == iter_mvs[ite][!id].as_int) { int_mv 
cur_fullpel_mv, prev_fullpel_mv; cur_fullpel_mv.as_mv.row = iter_mvs[ite][id].as_mv.row >> 3; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c index b8910370e0..048ab8732d 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c @@ -18,9 +18,12 @@ #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_tpl_model.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx/vpx_codec.h" +#include "vpx/vpx_ext_ratectrl.h" static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, const GF_GROUP *gf_group, int *tpl_group_frames) { @@ -407,8 +410,12 @@ static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats, tpl_block_stats_ptr->col = mi_col * 8; tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; - tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; - tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; + // inter/intra_cost here is calculated with SATD which should be close + // enough to be used as inter/intra_pred_error + tpl_block_stats_ptr->inter_pred_err = src_stats->inter_cost; + tpl_block_stats_ptr->intra_pred_err = src_stats->intra_cost; + tpl_block_stats_ptr->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_block_stats_ptr->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; tpl_block_stats_ptr->mv_r = src_stats->mv.as_mv.row; tpl_block_stats_ptr->mv_c = src_stats->mv.as_mv.col; tpl_block_stats_ptr->ref_frame_index = ref_frame_idx; @@ -721,7 +728,9 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); tpl_stats->intra_cost = VPXMAX( 1, (best_intra_cost 
<< TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); - tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; + if (best_rf_idx >= 0) { + tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; + } tpl_stats->mv.as_int = best_mv.as_int; *ref_frame_idx = best_rf_idx; } @@ -1489,6 +1498,53 @@ static void accumulate_frame_tpl_stats(VP9_COMP *cpi) { } #endif // CONFIG_RATE_CTRL +void vp9_estimate_tpl_qp_gop(VP9_COMP *cpi) { + int gop_length = cpi->twopass.gf_group.gf_group_size; + int bottom_index, top_index; + int idx; + const int gf_index = cpi->twopass.gf_group.index; + const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref; + const int refresh_frame_context = cpi->common.refresh_frame_context; + + for (idx = 1; idx <= gop_length; ++idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[idx]; + int target_rate = cpi->twopass.gf_group.bit_allocation[idx]; + cpi->twopass.gf_group.index = idx; + vp9_rc_set_frame_target(cpi, target_rate); + vp9_configure_buffer_updates(cpi, idx); + if (cpi->tpl_with_external_rc) { + VP9_COMMON *cm = &cpi->common; + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 && + cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) { + vpx_codec_err_t codec_status; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + vpx_rc_encodeframe_decision_t encode_frame_decision; + codec_status = vp9_extrc_get_encodeframe_decision( + &cpi->ext_ratectrl, gf_group->index - 1, &encode_frame_decision); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_encodeframe_decision() failed"); + } + tpl_frame->base_qindex = encode_frame_decision.q_index; + } else { + vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM, + "The external rate control library is not set " + "properly for TPL pass."); + } + } else { + tpl_frame->base_qindex = vp9_rc_pick_q_and_bounds_two_pass( + cpi, &bottom_index, &top_index, idx); + tpl_frame->base_qindex 
= VPXMAX(tpl_frame->base_qindex, 1); + } + } + // Reset the actual index and frame update + cpi->twopass.gf_group.index = gf_index; + cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref; + cpi->common.refresh_frame_context = refresh_frame_context; + vp9_configure_buffer_updates(cpi, gf_index); +} + void vp9_setup_tpl_stats(VP9_COMP *cpi) { GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; const GF_GROUP *gf_group = &cpi->twopass.gf_group; @@ -1512,12 +1568,16 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) { mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); } - // TPL stats has extra frames from next GOP. Trim those extra frames for - // Qmode. - trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, extended_frame_count); - if (cpi->ext_ratectrl.ready && cpi->ext_ratectrl.funcs.send_tpl_gop_stats != NULL) { + // Intra search on key frame + if (gf_picture[0].update_type == KF_UPDATE) { + mc_flow_dispenser(cpi, gf_picture, 0, cpi->tpl_bsize); + } + // TPL stats has extra frames from next GOP. Trim those extra frames for + // Qmode. 
+ trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, + extended_frame_count); const vpx_codec_err_t codec_status = vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats); if (codec_status != VPX_CODEC_OK) { diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h index 04beb22610..de0ac39a1f 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h @@ -31,6 +31,7 @@ typedef struct GF_PICTURE { void vp9_init_tpl_buffer(VP9_COMP *cpi); void vp9_setup_tpl_stats(VP9_COMP *cpi); void vp9_free_tpl_buffer(VP9_COMP *cpi); +void vp9_estimate_tpl_qp_gop(VP9_COMP *cpi); void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, TX_SIZE tx_size); diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c index 94506aad0f..628dc4fead 100644 --- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -886,14 +886,14 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, scale_plane_1_to_2_phase_0( src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w, src_h, vp9_filter_kernels[filter_type][8], temp_buffer); - scale_plane_1_to_2_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, - dst->uv_stride, src_w / 2, src_h / 2, - vp9_filter_kernels[filter_type][8], - temp_buffer); - scale_plane_1_to_2_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, - dst->uv_stride, src_w / 2, src_h / 2, - vp9_filter_kernels[filter_type][8], - temp_buffer); + const int src_uv_w = src->uv_crop_width; + const int src_uv_h = src->uv_crop_height; + scale_plane_1_to_2_phase_0( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + src_uv_w, src_uv_h, vp9_filter_kernels[filter_type][8], temp_buffer); + scale_plane_1_to_2_phase_0( + src->v_buffer, src->uv_stride, dst->v_buffer, 
dst->uv_stride, + src_uv_w, src_uv_h, vp9_filter_kernels[filter_type][8], temp_buffer); free(temp_buffer); } } diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc index fd81bce7b5..942c15ce49 100644 --- a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc +++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc @@ -12,10 +12,12 @@ #include <new> #include "vp9/common/vp9_common.h" +#include "vp9/encoder/vp9_aq_cyclicrefresh.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_picklpf.h" #include "vpx/vp8cx.h" #include "vpx/vpx_codec.h" +#include "vpx_mem/vpx_mem.h" namespace libvpx { diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.h b/media/libvpx/libvpx/vp9/ratectrl_rtc.h index 85005c5474..4c39255886 100644 --- a/media/libvpx/libvpx/vp9/ratectrl_rtc.h +++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.h @@ -12,43 +12,34 @@ #define VPX_VP9_RATECTRL_RTC_H_ #include <cstdint> +#include <cstring> +#include <limits> #include <memory> -#include "vp9/common/vp9_enums.h" -#include "vp9/vp9_iface_common.h" -#include "vp9/encoder/vp9_aq_cyclicrefresh.h" -#include "vp9/vp9_cx_iface.h" +#include "vpx/vpx_encoder.h" #include "vpx/internal/vpx_ratectrl_rtc.h" -#include "vpx_mem/vpx_mem.h" struct VP9_COMP; namespace libvpx { struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { - public: VP9RateControlRtcConfig() { - ss_number_layers = 1; - vp9_zero(max_quantizers); - vp9_zero(min_quantizers); - vp9_zero(scaling_factor_den); - vp9_zero(scaling_factor_num); - vp9_zero(layer_target_bitrate); - vp9_zero(ts_rate_decimator); + memset(layer_target_bitrate, 0, sizeof(layer_target_bitrate)); + memset(ts_rate_decimator, 0, sizeof(ts_rate_decimator)); scaling_factor_num[0] = 1; scaling_factor_den[0] = 1; max_quantizers[0] = max_quantizer; min_quantizers[0] = min_quantizer; - max_consec_drop = INT_MAX; } // Number of spatial layers - int ss_number_layers; - int max_quantizers[VPX_MAX_LAYERS]; - int min_quantizers[VPX_MAX_LAYERS]; - int 
scaling_factor_num[VPX_SS_MAX_LAYERS]; - int scaling_factor_den[VPX_SS_MAX_LAYERS]; + int ss_number_layers = 1; + int max_quantizers[VPX_MAX_LAYERS] = {}; + int min_quantizers[VPX_MAX_LAYERS] = {}; + int scaling_factor_num[VPX_SS_MAX_LAYERS] = {}; + int scaling_factor_den[VPX_SS_MAX_LAYERS] = {}; // This is only for SVC for now. - int max_consec_drop; + int max_consec_drop = std::numeric_limits<int>::max(); }; struct VP9FrameParamsQpRTC { @@ -105,9 +96,9 @@ class VP9RateControlRTC { const VP9FrameParamsQpRTC &frame_params); private: - VP9RateControlRTC() {} + VP9RateControlRTC() = default; bool InitRateControl(const VP9RateControlRtcConfig &cfg); - struct VP9_COMP *cpi_; + struct VP9_COMP *cpi_ = nullptr; }; } // namespace libvpx diff --git a/media/libvpx/libvpx/vp9/simple_encode.cc b/media/libvpx/libvpx/vp9/simple_encode.cc index 2e6f9a4513..5e565d1b1a 100644 --- a/media/libvpx/libvpx/vp9/simple_encode.cc +++ b/media/libvpx/libvpx/vp9/simple_encode.cc @@ -8,8 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <stdio.h> +#include <stdlib.h> + #include <memory> #include <vector> + #include "./ivfenc.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_enums.h" @@ -888,6 +892,10 @@ void SimpleEncode::ComputeFirstPassStats() { use_highbitdepth = impl_ptr_->cpi->common.use_highbitdepth; #endif vpx_image_t img; + if (impl_ptr_->img_fmt == VPX_IMG_FMT_NV12) { + fprintf(stderr, "VPX_IMG_FMT_NV12 is not supported\n"); + abort(); + } vpx_img_alloc(&img, impl_ptr_->img_fmt, frame_width_, frame_height_, 1); rewind(in_file_); impl_ptr_->first_pass_stats.clear(); @@ -1053,6 +1061,10 @@ void SimpleEncode::StartEncode() { vp9_set_first_pass_stats(&oxcf, &stats); assert(impl_ptr_->cpi == nullptr); impl_ptr_->cpi = init_encoder(&oxcf, impl_ptr_->img_fmt); + if (impl_ptr_->img_fmt == VPX_IMG_FMT_NV12) { + fprintf(stderr, "VPX_IMG_FMT_NV12 is not supported\n"); + abort(); + } vpx_img_alloc(&impl_ptr_->tmp_img, impl_ptr_->img_fmt, frame_width_, frame_height_, 1); diff --git a/media/libvpx/libvpx/vp9/vp9_cx_iface.c b/media/libvpx/libvpx/vp9/vp9_cx_iface.c index 8df04f29f0..fe62bac5f2 100644 --- a/media/libvpx/libvpx/vp9/vp9_cx_iface.c +++ b/media/libvpx/libvpx/vp9/vp9_cx_iface.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <limits.h> +#include <stdint.h> #include <stdlib.h> #include <string.h> @@ -17,6 +19,7 @@ #include "vpx_dsp/psnr.h" #include "vpx_ports/static_assert.h" #include "vpx_ports/system_state.h" +#include "vpx_util/vpx_thread.h" #include "vpx_util/vpx_timestamp.h" #include "vpx/internal/vpx_codec_internal.h" #include "./vpx_version.h" @@ -110,7 +113,6 @@ struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_enc_cfg_t cfg; struct vp9_extracfg extra_cfg; - vpx_rational64_t timestamp_ratio; vpx_codec_pts_t pts_offset; unsigned char pts_offset_initialized; VP9EncoderConfig oxcf; @@ -190,7 +192,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 2); RANGE_CHECK(extra_cfg, alt_ref_aq, 0, 1); RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1); - RANGE_CHECK_HI(cfg, g_threads, 64); + RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS); RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q); RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); @@ -1140,10 +1142,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, if (res == VPX_CODEC_OK) { priv->pts_offset_initialized = 0; - // TODO(angiebird): Replace priv->timestamp_ratio by - // oxcf->g_timebase_in_ts - priv->timestamp_ratio = get_g_timebase_in_ts(priv->cfg.g_timebase); - set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg); #if CONFIG_VP9_HIGHBITDEPTH priv->oxcf.use_highbitdepth = @@ -1166,9 +1164,9 @@ static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) { return VPX_CODEC_OK; } -static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, - unsigned long duration, - vpx_enc_deadline_t deadline) { +static vpx_codec_err_t pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, + unsigned long duration, + vpx_enc_deadline_t deadline) { MODE new_mode = BEST; #if CONFIG_REALTIME_ONLY @@ -1179,13 +1177,16 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, case 
VPX_RC_ONE_PASS: if (deadline > 0) { // Convert duration parameter from stream timebase to microseconds. - uint64_t duration_us; - VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 && (TICKS_PER_SEC % 1000000) == 0); - duration_us = duration * (uint64_t)ctx->timestamp_ratio.num / - (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000)); + if (duration > UINT64_MAX / (uint64_t)ctx->oxcf.g_timebase_in_ts.num) { + ERROR("duration is too big"); + } + uint64_t duration_us = duration * + (uint64_t)ctx->oxcf.g_timebase_in_ts.num / + ((uint64_t)ctx->oxcf.g_timebase_in_ts.den * + (TICKS_PER_SEC / 1000000)); // If the deadline is more that the duration this frame is to be shown, // use good quality mode. Otherwise use realtime mode. @@ -1208,6 +1209,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, ctx->oxcf.mode = new_mode; vp9_change_config(ctx->cpi, &ctx->oxcf); } + return VPX_CODEC_OK; } // Turn on to test if supplemental superframe data breaks decoding @@ -1281,6 +1283,10 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi, .is_key_frame)) flags |= VPX_FRAME_IS_KEY; + if (!cpi->common.show_frame) { + flags |= VPX_FRAME_IS_INVISIBLE; + } + if (cpi->droppable) flags |= VPX_FRAME_IS_DROPPABLE; return flags; @@ -1318,7 +1324,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, volatile vpx_enc_frame_flags_t flags = enc_flags; volatile vpx_codec_pts_t pts = pts_val; VP9_COMP *const cpi = ctx->cpi; - const vpx_rational64_t *const timestamp_ratio = &ctx->timestamp_ratio; + const vpx_rational64_t *const timebase_in_ts = &ctx->oxcf.g_timebase_in_ts; size_t data_sz; vpx_codec_cx_pkt_t pkt; memset(&pkt, 0, sizeof(pkt)); @@ -1347,13 +1353,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, } } - if (!ctx->pts_offset_initialized) { - ctx->pts_offset = pts; - ctx->pts_offset_initialized = 1; + res = pick_quickcompress_mode(ctx, duration, deadline); + if (res != VPX_CODEC_OK) { + return res; } - pts -= 
ctx->pts_offset; - - pick_quickcompress_mode(ctx, duration, deadline); vpx_codec_pkt_list_init(&ctx->pkt_list); // Handle Flags @@ -1384,20 +1387,53 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { unsigned int lib_flags = 0; - YV12_BUFFER_CONFIG sd; - int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, pts); size_t size, cx_data_sz; unsigned char *cx_data; - cpi->svc.timebase_fac = timebase_units_to_ticks(timestamp_ratio, 1); - cpi->svc.time_stamp_superframe = dst_time_stamp; - // Set up internal flags if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1; if (img != NULL) { + YV12_BUFFER_CONFIG sd; + + if (!ctx->pts_offset_initialized) { + ctx->pts_offset = pts; + ctx->pts_offset_initialized = 1; + } + if (pts < ctx->pts_offset) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM, + "pts is smaller than initial pts"); + } + pts -= ctx->pts_offset; + if (pts > INT64_MAX / timebase_in_ts->num) { + vpx_internal_error( + &cpi->common.error, VPX_CODEC_INVALID_PARAM, + "conversion of relative pts to ticks would overflow"); + } + const int64_t dst_time_stamp = + timebase_units_to_ticks(timebase_in_ts, pts); + + cpi->svc.timebase_fac = timebase_units_to_ticks(timebase_in_ts, 1); + cpi->svc.time_stamp_superframe = dst_time_stamp; + +#if ULONG_MAX > INT64_MAX + if (duration > INT64_MAX) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM, + "duration is too big"); + } +#endif + if (pts > INT64_MAX - (int64_t)duration) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM, + "relative pts + duration is too big"); + } + vpx_codec_pts_t pts_end = pts + (int64_t)duration; + if (pts_end > INT64_MAX / timebase_in_ts->num) { + vpx_internal_error( + &cpi->common.error, VPX_CODEC_INVALID_PARAM, + "conversion of relative pts + duration to ticks would overflow"); + } const int64_t dst_end_time_stamp = - timebase_units_to_ticks(timestamp_ratio, pts + duration); + 
timebase_units_to_ticks(timebase_in_ts, pts_end); res = image2yuvconfig(img, &sd); if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) { @@ -1434,7 +1470,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (cx_data_sz < ctx->cx_data_sz / 2) { vpx_internal_error(&cpi->common.error, VPX_CODEC_ERROR, "Compressed data buffer too small"); - return VPX_CODEC_ERROR; } } @@ -1443,6 +1478,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // compute first pass stats if (img) { int ret; + int64_t dst_time_stamp; int64_t dst_end_time_stamp; vpx_codec_cx_pkt_t fps_pkt; ENCODE_FRAME_RESULT encode_frame_result; @@ -1469,6 +1505,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, #endif // !CONFIG_REALTIME_ONLY } else { ENCODE_FRAME_RESULT encode_frame_result; + int64_t dst_time_stamp; int64_t dst_end_time_stamp; vp9_init_encode_frame_result(&encode_frame_result); while (cx_data_sz >= ctx->cx_data_sz / 2 && @@ -1507,10 +1544,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (ctx->output_cx_pkt_cb.output_cx_pkt) { pkt.kind = VPX_CODEC_CX_FRAME_PKT; pkt.data.frame.pts = - ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) + + ticks_to_timebase_units(timebase_in_ts, dst_time_stamp) + ctx->pts_offset; pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( - timestamp_ratio, dst_end_time_stamp - dst_time_stamp); + timebase_in_ts, dst_end_time_stamp - dst_time_stamp); pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); pkt.data.frame.buf = ctx->pending_cx_data; pkt.data.frame.sz = size; @@ -1527,10 +1564,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // Add the frame packet to the list of returned packets. 
pkt.kind = VPX_CODEC_CX_FRAME_PKT; pkt.data.frame.pts = - ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) + + ticks_to_timebase_units(timebase_in_ts, dst_time_stamp) + ctx->pts_offset; pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( - timestamp_ratio, dst_end_time_stamp - dst_time_stamp); + timebase_in_ts, dst_end_time_stamp - dst_time_stamp); pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width; pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height; @@ -1979,6 +2016,7 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, ratectrl_config.frame_rate_den = oxcf->g_timebase.num; ratectrl_config.overshoot_percent = oxcf->over_shoot_pct; ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; + ratectrl_config.base_qp = oxcf->cq_level; if (oxcf->rc_mode == VPX_VBR) { ratectrl_config.rc_mode = VPX_RC_VBR; @@ -2223,7 +2261,7 @@ static vpx_codec_enc_cfg_t get_enc_cfg(int frame_width, int frame_height, return enc_cfg; } -static vp9_extracfg get_extra_cfg() { +static vp9_extracfg get_extra_cfg(void) { vp9_extracfg extra_cfg = default_extra_cfg; return extra_cfg; } diff --git a/media/libvpx/libvpx/vp9/vp9_dx_iface.c b/media/libvpx/libvpx/vp9/vp9_dx_iface.c index 860f721dc5..7567910b9b 100644 --- a/media/libvpx/libvpx/vp9/vp9_dx_iface.c +++ b/media/libvpx/libvpx/vp9/vp9_dx_iface.c @@ -19,7 +19,6 @@ #include "vpx/vpx_decoder.h" #include "vpx_dsp/bitreader_buffer.h" #include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_util/vpx_thread.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_frame_buffers.h" diff --git a/media/libvpx/libvpx/vp9/vp9cx.mk b/media/libvpx/libvpx/vp9/vp9cx.mk index 44790ef6a4..7a0e2d8d1f 100644 --- a/media/libvpx/libvpx/vp9/vp9cx.mk +++ b/media/libvpx/libvpx/vp9/vp9cx.mk @@ -140,6 +140,7 @@ endif VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c VP9_CX_SRCS-$(HAVE_NEON) += 
encoder/arm/neon/vp9_error_neon.c +VP9_CX_SRCS-$(HAVE_SVE) += encoder/arm/neon/vp9_error_sve.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) diff --git a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h index 01d64b14b7..2643b5578a 100644 --- a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h +++ b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h @@ -22,8 +22,14 @@ enum class FrameDropDecision { kDrop, // Frame is dropped. }; +struct UVDeltaQP { + // For the UV channel: the QP for the dc/ac value is given as + // GetQP() + uvdc/ac_delta_q, where the uvdc/ac_delta_q are negative numbers. + int uvdc_delta_q; + int uvac_delta_q; +}; + struct VpxRateControlRtcConfig { - public: VpxRateControlRtcConfig() { width = 1280; height = 720; diff --git a/media/libvpx/libvpx/vpx/src/vpx_encoder.c b/media/libvpx/libvpx/vpx/src/vpx_encoder.c index 017525aeee..001d854abe 100644 --- a/media/libvpx/libvpx/vpx/src/vpx_encoder.c +++ b/media/libvpx/libvpx/vpx/src/vpx_encoder.c @@ -14,6 +14,7 @@ */ #include <assert.h> #include <limits.h> +#include <stdint.h> #include <stdlib.h> #include <string.h> #include "vp8/common/blockd.h" @@ -184,8 +185,8 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, while (0) #else -static void FLOATING_POINT_INIT() {} -static void FLOATING_POINT_RESTORE() {} +static void FLOATING_POINT_INIT(void) {} +static void FLOATING_POINT_RESTORE(void) {} #endif vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, @@ -200,6 +201,10 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, res = VPX_CODEC_ERROR; else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; +#if ULONG_MAX > UINT32_MAX + else if (duration > UINT32_MAX || deadline > UINT32_MAX) + res = VPX_CODEC_INVALID_PARAM; 
+#endif else { unsigned int num_enc = ctx->priv->enc.total_encoders; diff --git a/media/libvpx/libvpx/vpx/src/vpx_image.c b/media/libvpx/libvpx/vpx/src/vpx_image.c index f9f0dd6025..3f7ff74244 100644 --- a/media/libvpx/libvpx/vpx/src/vpx_image.c +++ b/media/libvpx/libvpx/vpx/src/vpx_image.c @@ -27,6 +27,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, if (img != NULL) memset(img, 0, sizeof(vpx_image_t)); + if (fmt == VPX_IMG_FMT_NONE) goto fail; + /* Treat align==0 like align==1 */ if (!buf_align) buf_align = 1; @@ -56,7 +58,7 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, /* Get chroma shift values for this format */ // For VPX_IMG_FMT_NV12, xcs needs to be 0 such that UV data is all read at - // one time. + // once. switch (fmt) { case VPX_IMG_FMT_I420: case VPX_IMG_FMT_YV12: diff --git a/media/libvpx/libvpx/vpx/src/vpx_tpl.c b/media/libvpx/libvpx/vpx/src/vpx_tpl.c index 62c2a9c857..b0687a8135 100644 --- a/media/libvpx/libvpx/vpx/src/vpx_tpl.c +++ b/media/libvpx/libvpx/vpx/src/vpx_tpl.c @@ -47,8 +47,8 @@ vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file, "%" PRId64 " %" PRId64 " %" PRId16 " %" PRId16 " %" PRId64 " %" PRId64 " %d\n", block_stats.inter_cost, block_stats.intra_cost, - block_stats.mv_c, block_stats.mv_r, block_stats.recrf_dist, - block_stats.recrf_rate, block_stats.ref_frame_index)); + block_stats.mv_c, block_stats.mv_r, block_stats.srcrf_dist, + block_stats.srcrf_rate, block_stats.ref_frame_index)); } } @@ -88,7 +88,7 @@ vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file, " %" SCNd64 " %d\n", &block_stats->inter_cost, &block_stats->intra_cost, &block_stats->mv_c, &block_stats->mv_r, - &block_stats->recrf_dist, &block_stats->recrf_rate, + &block_stats->srcrf_dist, &block_stats->srcrf_rate, &block_stats->ref_frame_index), 7); } diff --git a/media/libvpx/libvpx/vpx/vp8cx.h b/media/libvpx/libvpx/vpx/vp8cx.h index b12938d3d8..dfdbb3c770 100644 --- a/media/libvpx/libvpx/vpx/vp8cx.h +++ 
b/media/libvpx/libvpx/vpx/vp8cx.h @@ -772,6 +772,8 @@ enum vp8e_enc_control_id { /*!\brief Codec control to use external RC to control TPL. * * This will use external RC to control the QP and GOP structure for TPL. + * (rc_type & VPX_RC_QP) in vpx_rc_funcs_t must be non zero. + * get_encodeframe_decision callback in vpx_rc_funcs_t also needs to be set. * * Supported in codecs: VP9 */ diff --git a/media/libvpx/libvpx/vpx/vpx_encoder.h b/media/libvpx/libvpx/vpx/vpx_encoder.h index 18e3862bd7..809a097d94 100644 --- a/media/libvpx/libvpx/vpx/vpx_encoder.h +++ b/media/libvpx/libvpx/vpx/vpx_encoder.h @@ -31,7 +31,6 @@ extern "C" { #include "./vpx_codec.h" // IWYU pragma: export #include "./vpx_ext_ratectrl.h" -#include "./vpx_tpl.h" /*! Temporal Scalability: Maximum length of the sequence defining frame * layer membership @@ -57,10 +56,15 @@ extern "C" { * must be bumped. Examples include, but are not limited to, changing * types, removing or reassigning enums, adding/removing/rearranging * fields to structures + * + * \note + * VPX_ENCODER_ABI_VERSION has a VPX_EXT_RATECTRL_ABI_VERSION component + * because the VP9E_SET_EXTERNAL_RATE_CONTROL codec control uses + * vpx_rc_funcs_t. */ -#define VPX_ENCODER_ABI_VERSION \ - (16 + VPX_CODEC_ABI_VERSION + VPX_EXT_RATECTRL_ABI_VERSION + \ - VPX_TPL_ABI_VERSION) /**<\hideinitializer*/ +#define VPX_ENCODER_ABI_VERSION \ + (18 + VPX_CODEC_ABI_VERSION + \ + VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -1074,6 +1078,12 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, * The buffer was set successfully. * \retval #VPX_CODEC_INVALID_PARAM * A parameter was NULL, the image format is unsupported, etc. + * + * \note + * `duration` and `deadline` are of the unsigned long type, which can be 32 + * or 64 bits. `duration` and `deadline` must be less than or equal to + * UINT32_MAX so that their ranges are independent of the size of unsigned + * long. 
*/ vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx, const vpx_fixed_buf_t *buf, diff --git a/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h index 46d290dff4..ba12e4f83b 100644 --- a/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h +++ b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h @@ -26,7 +26,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures. */ -#define VPX_EXT_RATECTRL_ABI_VERSION (7) +#define VPX_EXT_RATECTRL_ABI_VERSION (5 + VPX_TPL_ABI_VERSION) /*!\brief The control type of the inference API. * In VPX_RC_QP mode, the external rate control model determines the @@ -81,17 +81,10 @@ typedef void *vpx_rc_model_t; * * The encoder will receive the decision from the external rate control model * through get_encodeframe_decision() defined in vpx_rc_funcs_t. - * - * If q_index = VPX_DEFAULT_Q, the encoder will use libvpx's default q. - * - * If max_frame_size = 0, the encoding ignores max frame size limit. - * If max_frame_size = -1, the encoding uses VP9's max frame size as the limit. - * If the encoded frame size is larger than max_frame_size, the frame is - * recoded to meet the size limit, following VP9's recoding principles. */ typedef struct vpx_rc_encodeframe_decision { - int q_index; /**< Quantizer step index [0..255]*/ - int max_frame_size; /**< Maximal frame size allowed to encode a frame*/ + int q_index; /**< Quantizer step index [0..255]*/ + int rdmult; /**< Frame level Lagrangian multiplier*/ } vpx_rc_encodeframe_decision_t; /*!\brief Information for the frame to be encoded. 
@@ -322,6 +315,7 @@ typedef struct vpx_rc_config { vpx_ext_rc_mode_t rc_mode; /**< Q mode or VBR mode */ int overshoot_percent; /**< for VBR mode only */ int undershoot_percent; /**< for VBR mode only */ + int base_qp; /**< base QP for leaf frames, 0-255 */ } vpx_rc_config_t; /*!\brief Information passed to the external rate control model to @@ -400,6 +394,7 @@ typedef struct vpx_rc_gop_info { typedef struct vpx_rc_gop_decision { int gop_coding_frames; /**< The number of frames of this GOP */ int use_alt_ref; /**< Whether to use alt ref for this GOP */ + int use_key_frame; /**< Whether to set key frame for this GOP */ } vpx_rc_gop_decision_t; /*!\brief Create an external rate control model callback prototype @@ -446,12 +441,11 @@ typedef vpx_rc_status_t (*vpx_rc_send_tpl_gop_stats_cb_fn_t)( * the external rate control model. * * \param[in] rate_ctrl_model rate control model - * \param[in] encode_frame_info information of the coding frame + * \param[in] frame_gop_index index of the frame in current gop * \param[out] frame_decision encode decision of the coding frame */ typedef vpx_rc_status_t (*vpx_rc_get_encodeframe_decision_cb_fn_t)( - vpx_rc_model_t rate_ctrl_model, - const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_model_t rate_ctrl_model, const int frame_gop_index, vpx_rc_encodeframe_decision_t *frame_decision); /*!\brief Update encode frame result callback prototype @@ -472,12 +466,10 @@ typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)( * the external rate control model. 
* * \param[in] rate_ctrl_model rate control model - * \param[in] gop_info information collected from the encoder * \param[out] gop_decision GOP decision from the model */ typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)( - vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, - vpx_rc_gop_decision_t *gop_decision); + vpx_rc_model_t rate_ctrl_model, vpx_rc_gop_decision_t *gop_decision); /*!\brief Get the frame rdmult from the external rate control model. * diff --git a/media/libvpx/libvpx/vpx/vpx_tpl.h b/media/libvpx/libvpx/vpx/vpx_tpl.h index a250aada60..7e4c9ab7e1 100644 --- a/media/libvpx/libvpx/vpx/vpx_tpl.h +++ b/media/libvpx/libvpx/vpx/vpx_tpl.h @@ -32,19 +32,21 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_TPL_ABI_VERSION (2) /**<\hideinitializer*/ +#define VPX_TPL_ABI_VERSION (3) /**<\hideinitializer*/ /*!\brief Temporal dependency model stats for each block before propagation */ typedef struct VpxTplBlockStats { - int16_t row; /**< Pixel row of the top left corner */ - int16_t col; /**< Pixel col of the top left corner */ - int64_t intra_cost; /**< Intra cost */ - int64_t inter_cost; /**< Inter cost */ - int16_t mv_r; /**< Motion vector row */ - int16_t mv_c; /**< Motion vector col */ - int64_t recrf_rate; /**< Rate from reconstructed ref frame */ - int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ - int ref_frame_index; /**< Ref frame index in the ref frame buffer */ + int16_t row; /**< Pixel row of the top left corner */ + int16_t col; /**< Pixel col of the top left corner */ + int64_t intra_cost; /**< Intra cost */ + int64_t inter_cost; /**< Inter cost */ + int16_t mv_r; /**< Motion vector row */ + int16_t mv_c; /**< Motion vector col */ + int64_t srcrf_rate; /**< Rate from source ref frame */ + int64_t srcrf_dist; /**< Distortion from source ref frame */ + int64_t inter_pred_err; /**< Inter prediction error */ + int64_t intra_pred_err; 
/**< Intra prediction error */ + int ref_frame_index; /**< Ref frame index in the ref frame buffer */ } VpxTplBlockStats; /*!\brief Temporal dependency model stats for each frame before propagation */ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c index 683df5797a..f8b94620d4 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -168,40 +168,40 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, \ if (xoffset == 0) { \ if (yoffset == 0) { \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp[w * h]; \ highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \ h); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \ src_stride, h, yoffset); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ (h + 1)); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + 
return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ (h + 1)); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else { \ @@ -209,21 +209,21 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, if (yoffset == 0) { \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \ xoffset); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + 1), xoffset); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + 1), xoffset); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } \ @@ -430,22 +430,22 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, } while (--i != 0); } -#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ - uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t tmp0[w 
* (h + 1)]; \ - uint16_t tmp1[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ - xoffset); \ - highbd_avg_pred_var_filter_block2d_bil_w##w( \ - tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ - CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ +#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ + w, ref, ref_stride, sse); \ } #define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ @@ -460,19 +460,19 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, if (yoffset == 0) { \ highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ highbd_avg_pred_var_filter_block2d_avg( \ src_ptr, tmp, source_stride, source_stride, w, h, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } else { \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ src_ptr, tmp, 
source_stride, source_stride, h, yoffset, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ @@ -481,7 +481,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, highbd_avg_pred_var_filter_block2d_avg( \ src_ptr, tmp0, source_stride, 1, w, h, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * (h + 1)]; \ @@ -489,7 +489,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, (h + 1)); \ highbd_avg_pred_var_filter_block2d_avg( \ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * (h + 1)]; \ @@ -497,7 +497,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, (h + 1)); \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else { \ @@ -506,7 +506,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, highbd_avg_pred_var_filter_block2d_bil_w##w( \ src_ptr, tmp0, source_stride, 1, h, xoffset, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ @@ -514,7 
+514,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, (h + 1), xoffset); \ highbd_avg_pred_var_filter_block2d_avg( \ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ @@ -522,7 +522,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, (h + 1), xoffset); \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } \ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c new file mode 100644 index 0000000000..cebe06b099 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" +#include "vpx_ports/mem.h" + +static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h) { + uint64x2_t sse = vdupq_n_u64(0); + + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t r = vld1q_u16(ref_ptr + j); + + uint16x8_t diff = vabdq_u16(s, r); + + sse = vpx_dotq_u16(sse, diff, diff); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + return (uint32_t)horizontal_add_uint64x2(sse); +} + +#define HIGHBD_MSE_WXH_SVE(w, h) \ + uint32_t vpx_highbd_10_mse##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint32_t sse_tmp = \ + highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h); \ + sse_tmp = ROUND_POWER_OF_TWO(sse_tmp, 4); \ + *sse = sse_tmp; \ + return sse_tmp; \ + } \ + \ + uint32_t vpx_highbd_12_mse##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint32_t sse_tmp = \ + highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h); \ + sse_tmp = ROUND_POWER_OF_TWO(sse_tmp, 8); \ + *sse = sse_tmp; \ + return sse_tmp; \ + } + +HIGHBD_MSE_WXH_SVE(16, 16) +HIGHBD_MSE_WXH_SVE(16, 8) +HIGHBD_MSE_WXH_SVE(8, 16) +HIGHBD_MSE_WXH_SVE(8, 8) + +#undef HIGHBD_MSE_WXH_SVE + +// Process a block of width 4 two rows at a time. 
+static INLINE void highbd_variance_4xh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, uint64_t *sse, + int64_t *sum) { + int16x8_t sum_s16 = vdupq_n_s16(0); + int64x2_t sse_s64 = vdupq_n_s64(0); + + do { + const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride); + const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride); + + int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s16 = vaddq_s16(sum_s16, diff); + sse_s64 = vpx_dotq_s16(sse_s64, diff, diff); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + h -= 2; + } while (h != 0); + + *sum = horizontal_add_int16x8(sum_s16); + *sse = horizontal_add_int64x2(sse_s64); +} + +static INLINE void highbd_variance_8xh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, uint64_t *sse, + int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int64x2_t sse_s64 = vdupq_n_s64(0); + + do { + const uint16x8_t s = vld1q_u16(src_ptr); + const uint16x8_t r = vld1q_u16(ref_ptr); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s32 = vpadalq_s16(sum_s32, diff); + sse_s64 = vpx_dotq_s16(sse_s64, diff, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_add_int64x2(sse_s64); +} + +static INLINE void highbd_variance_16xh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + const uint16x8_t s0 = vld1q_u16(src_ptr); + const uint16x8_t s1 = vld1q_u16(src_ptr + 8); + + const uint16x8_t r0 = vld1q_u16(ref_ptr); + const uint16x8_t r1 = vld1q_u16(ref_ptr + 8); + + const int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0)); + const int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(s1, 
r1)); + + sum_s32[0] = vpadalq_s16(sum_s32[0], diff0); + sum_s32[1] = vpadalq_s16(sum_s32[1], diff1); + + sse_s64[0] = vpx_dotq_s16(sse_s64[0], diff0, diff0); + sse_s64[1] = vpx_dotq_s16(sse_s64[1], diff1, diff1); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]); + sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]); + + *sum = horizontal_add_int32x4(sum_s32[0]); + *sse = horizontal_add_int64x2(sse_s64[0]); +} + +static INLINE void highbd_variance_wxh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + int64x2_t sse_s64[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0) }; + + do { + int i = 0; + do { + const uint16x8_t s0 = vld1q_u16(src_ptr + i); + const uint16x8_t s1 = vld1q_u16(src_ptr + i + 8); + const uint16x8_t s2 = vld1q_u16(src_ptr + i + 16); + const uint16x8_t s3 = vld1q_u16(src_ptr + i + 24); + + const uint16x8_t r0 = vld1q_u16(ref_ptr + i); + const uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8); + const uint16x8_t r2 = vld1q_u16(ref_ptr + i + 16); + const uint16x8_t r3 = vld1q_u16(ref_ptr + i + 24); + + const int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0)); + const int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(s1, r1)); + const int16x8_t diff2 = vreinterpretq_s16_u16(vsubq_u16(s2, r2)); + const int16x8_t diff3 = vreinterpretq_s16_u16(vsubq_u16(s3, r3)); + + sum_s32[0] = vpadalq_s16(sum_s32[0], diff0); + sum_s32[1] = vpadalq_s16(sum_s32[1], diff1); + sum_s32[2] = vpadalq_s16(sum_s32[2], diff2); + sum_s32[3] = vpadalq_s16(sum_s32[3], diff3); + + sse_s64[0] = vpx_dotq_s16(sse_s64[0], diff0, diff0); + sse_s64[1] = vpx_dotq_s16(sse_s64[1], diff1, diff1); + sse_s64[2] = vpx_dotq_s16(sse_s64[2], diff2, diff2); + sse_s64[3] = vpx_dotq_s16(sse_s64[3], diff3, diff3); + + i += 32; + 
} while (i < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]); + sum_s32[2] = vaddq_s32(sum_s32[2], sum_s32[3]); + sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[2]); + + sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]); + sse_s64[2] = vaddq_s64(sse_s64[2], sse_s64[3]); + sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[2]); + + *sum = horizontal_add_int32x4(sum_s32[0]); + *sse = horizontal_add_int64x2(sse_s64[0]); +} + +static INLINE void highbd_variance_32xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_wxh_sve(src, src_stride, ref, ref_stride, 32, h, sse, sum); +} + +static INLINE void highbd_variance_64xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_wxh_sve(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} + +#define HBD_VARIANCE_WXH_SVE(w, h) \ + uint32_t vpx_highbd_8_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + sum = (int)sum_long; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \ + } \ + \ + uint32_t vpx_highbd_10_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, 
&sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +HBD_VARIANCE_WXH_SVE(4, 4) +HBD_VARIANCE_WXH_SVE(4, 8) + +HBD_VARIANCE_WXH_SVE(8, 4) +HBD_VARIANCE_WXH_SVE(8, 8) +HBD_VARIANCE_WXH_SVE(8, 16) + +HBD_VARIANCE_WXH_SVE(16, 8) +HBD_VARIANCE_WXH_SVE(16, 16) +HBD_VARIANCE_WXH_SVE(16, 32) + +HBD_VARIANCE_WXH_SVE(32, 16) +HBD_VARIANCE_WXH_SVE(32, 32) +HBD_VARIANCE_WXH_SVE(32, 64) + +HBD_VARIANCE_WXH_SVE(64, 32) +HBD_VARIANCE_WXH_SVE(64, 64) + +#define HIGHBD_GET_VAR_SVE(s) \ + void vpx_highbd_8_get##s##x##s##var_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + *sum = (int)sum_long; \ + } \ + \ + void vpx_highbd_10_get##s##x##s##var_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + 
uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + } \ + \ + void vpx_highbd_12_get##s##x##s##var_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + } + +HIGHBD_GET_VAR_SVE(8) +HIGHBD_GET_VAR_SVE(16) diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c index 47684473ca..b5a944d299 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -14,86 +14,51 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" -static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p, - int16x4_t *const s0, int16x4_t *const s1, - int16x4_t *const s2, int16x4_t *const s3) { - *s0 = vld1_s16(s); - s += p; - *s1 = vld1_s16(s); - s += p; - *s2 = vld1_s16(s); - s += p; - *s3 = vld1_s16(s); -} - -static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p, - uint16x8_t *const s0, uint16x8_t *const s1, - uint16x8_t *const s2, uint16x8_t *const s3) { - *s0 = vld1q_u16(s); - s += p; - *s1 = vld1q_u16(s); - s += p; - *s2 = vld1q_u16(s); - s += p; - *s3 = 
vld1q_u16(s); -} - -static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p, - int16x8_t *const s0, int16x8_t *const s1, - int16x8_t *const s2, int16x8_t *const s3, - int16x8_t *const s4, int16x8_t *const s5, - int16x8_t *const s6, int16x8_t *const s7) { - *s0 = vld1q_s16(s); - s += p; - *s1 = vld1q_s16(s); - s += p; - *s2 = vld1q_s16(s); - s += p; - *s3 = vld1q_s16(s); - s += p; - *s4 = vld1q_s16(s); - s += p; - *s5 = vld1q_s16(s); - s += p; - *s6 = vld1q_s16(s); - s += p; - *s7 = vld1q_s16(s); +static INLINE uint16x4_t highbd_convolve4_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t filters, const uint16x4_t max) { + int32x4_t sum = vmull_lane_s16(s0, filters, 0); + sum = vmlal_lane_s16(sum, s1, filters, 1); + sum = vmlal_lane_s16(sum, s2, filters, 2); + sum = vmlal_lane_s16(sum, s3, filters, 3); + + uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); + return vmin_u16(res, max); } -static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p, - const uint16x8_t s0, const uint16x8_t s1, - const uint16x8_t s2, const uint16x8_t s3, - const uint16x8_t s4, const uint16x8_t s5, - const uint16x8_t s6, const uint16x8_t s7) { - vst1q_u16(s, s0); - s += p; - vst1q_u16(s, s1); - s += p; - vst1q_u16(s, s2); - s += p; - vst1q_u16(s, s3); - s += p; - vst1q_u16(s, s4); - s += p; - vst1q_u16(s, s5); - s += p; - vst1q_u16(s, s6); - s += p; - vst1q_u16(s, s7); +static INLINE uint16x8_t highbd_convolve4_8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x4_t filters, const uint16x8_t max) { + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters, 1); + sum1 = 
vmlal_lane_s16(sum1, vget_high_s16(s2), filters, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters, 3); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); + return vminq_u16(res, max); } -static INLINE int32x4_t highbd_convolve8_4( - const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, - const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) { +static INLINE uint16x4_t +highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filters, const uint16x4_t max) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); - int32x4_t sum; - sum = vmull_lane_s16(s0, filters_lo, 0); + int32x4_t sum = vmull_lane_s16(s0, filters_lo, 0); sum = vmlal_lane_s16(sum, s1, filters_lo, 1); sum = vmlal_lane_s16(sum, s2, filters_lo, 2); sum = vmlal_lane_s16(sum, s3, filters_lo, 3); @@ -101,7 +66,9 @@ static INLINE int32x4_t highbd_convolve8_4( sum = vmlal_lane_s16(sum, s5, filters_hi, 1); sum = vmlal_lane_s16(sum, s6, filters_hi, 2); sum = vmlal_lane_s16(sum, s7, filters_hi, 3); - return sum; + + uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); + return vmin_u16(res, max); } static INLINE uint16x8_t @@ -111,10 +78,8 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t filters, const uint16x8_t max) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); - int32x4_t sum0, sum1; - uint16x8_t d; - sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0); + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2); sum0 = 
vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3); @@ -122,7 +87,8 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3); - sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3); @@ -130,9 +96,152 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3); - d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7)); - d = vminq_u16(d, max); - return d; + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve_4tap_horiz_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = + highbd_convolve4_4(s0[0], s0[1], s0[2], s0[3], filter, max); + uint16x4_t d1 = + highbd_convolve4_4(s1[0], s1[1], s1[2], s1[3], 
filter, max); + uint16x4_t d2 = + highbd_convolve4_4(s2[0], s2[1], s2[2], s2[3], filter, max); + uint16x4_t d3 = + highbd_convolve4_4(s3[0], s3[1], s3[2], s3[3], filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = + highbd_convolve4_8(s0[0], s0[1], s0[2], s0[3], filter, max); + uint16x8_t d1 = + highbd_convolve4_8(s1[0], s1[1], s1[2], s1[3], filter, max); + uint16x8_t d2 = + highbd_convolve4_8(s2[0], s2[1], s2[2], s2[3], filter, max); + uint16x8_t d3 = + highbd_convolve4_8(s3[0], s3[1], s3[2], s3[3], filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void highbd_convolve_8tap_horiz_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x4_t s0[8], s1[8], s2[8], s3[8]; + load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + 
load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], s0[6], s0[7], filter, max); + uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], s1[6], s1[7], filter, max); + uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], s2[6], s2[7], filter, max); + uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], s3[6], s3[7], filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], s0[6], s0[7], filter, max); + uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], s1[6], s1[7], filter, max); + uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], s2[6], s2[7], filter, max); + uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], s3[6], s3[7], filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } } void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t 
src_stride, @@ -143,202 +252,25 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, if (x_step_q4 != 16) { vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); - } else { - const int16x8_t filters = vld1q_s16(filter[x0_q4]); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - uint16x8_t t0, t1, t2, t3; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - - src -= 3; - - if (h == 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int32x4_t d0, d1, d2, d3; - uint16x8_t d01, d23; - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); - transpose_u16_8x4(&t0, &t1, &t2, &t3); - s0 = vreinterpret_s16_u16(vget_low_u16(t0)); - s1 = vreinterpret_s16_u16(vget_low_u16(t1)); - s2 = vreinterpret_s16_u16(vget_low_u16(t2)); - s3 = vreinterpret_s16_u16(vget_low_u16(t3)); - s4 = vreinterpret_s16_u16(vget_high_u16(t0)); - s5 = vreinterpret_s16_u16(vget_high_u16(t1)); - s6 = vreinterpret_s16_u16(vget_high_u16(t2)); - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - src += 7; - - do { - load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); - transpose_s16_4x4d(&s7, &s8, &s9, &s10); - - d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + return; + } - d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); - d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); - d01 = vminq_u16(d01, max); - d23 = vminq_u16(d23, max); - 
transpose_u16_4x4q(&d01, &d23); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); - vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01)); - vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23)); - vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01)); - vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23)); + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; - s0 = s4; - s1 = s5; - s2 = s6; - s3 = s7; - s4 = s8; - s5 = s9; - s6 = s10; - src += 4; - dst += 4; - w -= 4; - } while (w > 0); - } else { - int16x8_t t4, t5, t6, t7; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint16x8_t d0, d1, d2, d3; - - if (w == 4) { - do { - load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, - &s5, &s6, &s7); - transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10, - &t4, &t5, &t6, &t7); - src += 8 * src_stride; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); - transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7); - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - d3 = - highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, 
max); - - transpose_u16_8x4(&d0, &d1, &d2, &d3); - vst1_u16(dst, vget_low_u16(d0)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d1)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d2)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d3)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d0)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d1)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d2)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d3)); - dst += dst_stride; - h -= 8; - } while (h > 0); - } else { - int width; - const uint16_t *s; - uint16_t *d; - int16x8_t s11, s12, s13, s14; - uint16x8_t d4, d5, d6, d7; - - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, - &s5, &s6, &s7); - transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - width = w; - s = src + 7; - d = dst; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); - - do { - load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11, - &s12, &s13, &s14); - transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); - - d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, - max); - d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, - max); - d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, - max); - d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, 
s9, s10, filters, - max); - d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, - max); - d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, - max); - d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, - max); - d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, - filters, max); - - transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); - - s0 = s8; - s1 = s9; - s2 = s10; - s3 = s11; - s4 = s12; - s5 = s13; - s6 = s14; - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 8 * src_stride; - dst += 8 * dst_stride; - h -= 8; - } while (h > 0); - } - } + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2); + highbd_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h, + x_filter_4tap, bd); + } else { + const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]); + highbd_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h, + x_filter_8tap, bd); } } @@ -352,66 +284,233 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + + src -= 3; + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x4_t s0[8], s1[8], s2[8], s3[8]; + load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], 
&s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], s0[6], s0[7], filters, max); + uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], s1[6], s1[7], filters, max); + uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], s2[6], s2[7], filters, max); + uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], s3[6], s3[7], filters, max); + + d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride)); + d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride)); + d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride)); + d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], s0[6], s0[7], filters, max); + uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], s1[6], s1[7], filters, max); + uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], s2[6], s2[7], filters, max); + uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], s3[6], s3[7], filters, max); + + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * 
dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void highbd_convolve_4tap_vert_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, filter, max); + uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, filter, max); + uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, filter, max); + uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); } else { - const int16x8_t filters = vld1q_s16(filter[x0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - - src -= 3; - - if (h == 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int32x4_t d0, d1, d2, d3; - uint16x8_t t0, t1, t2, t3; - uint16x8_t d01, d23, t01, t23; - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); - transpose_u16_8x4(&t0, &t1, &t2, &t3); - s0 = vreinterpret_s16_u16(vget_low_u16(t0)); - s1 = vreinterpret_s16_u16(vget_low_u16(t1)); - s2 = 
vreinterpret_s16_u16(vget_low_u16(t2)); - s3 = vreinterpret_s16_u16(vget_low_u16(t3)); - s4 = vreinterpret_s16_u16(vget_high_u16(t0)); - s5 = vreinterpret_s16_u16(vget_high_u16(t1)); - s6 = vreinterpret_s16_u16(vget_high_u16(t2)); - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - src += 7; + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + + s += 3 * src_stride; do { - load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); - transpose_s16_4x4d(&s7, &s8, &s9, &s10); - - d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); - t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); - t01 = vminq_u16(t01, max); - t23 = vminq_u16(t23, max); - transpose_u16_4x4q(&t01, &t23); - - d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), - vld1_u16(dst + 2 * dst_stride)); - d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride), - vld1_u16(dst + 3 * dst_stride)); - d01 = vrhaddq_u16(d01, t01); - d23 = vrhaddq_u16(d23, t23); - - vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01)); - vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23)); - vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01)); - vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23)); + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x8_t d0 = highbd_convolve4_8(s0, s1, s2, s3, filter, max); + uint16x8_t d1 = highbd_convolve4_8(s1, s2, s3, s4, filter, max); + uint16x8_t d2 = highbd_convolve4_8(s2, s3, s4, s5, filter, max); + uint16x8_t d3 = 
highbd_convolve4_8(s3, s4, s5, s6, filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void highbd_convolve_8tap_vert_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = + highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter, max); + uint16x4_t d1 = + highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter, max); + uint16x4_t d2 = + highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter, max); + uint16x4_t d3 = + highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + + s += 7 * src_stride; + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = + highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter, max); + uint16x8_t d1 = + highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter, max); + uint16x8_t d2 = + highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter, 
max); + uint16x8_t d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -420,164 +519,14 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, s4 = s8; s5 = s9; s6 = s10; - src += 4; - dst += 4; - w -= 4; - } while (w > 0); - } else { - int16x8_t t4, t5, t6, t7; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3; - - if (w == 4) { - do { - load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, - &s5, &s6, &s7); - transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10, - &t4, &t5, &t6, &t7); - src += 8 * src_stride; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); - transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7); - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - t3 = - highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - transpose_u16_8x4(&t0, &t1, &t2, &t3); - - d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), - vld1_u16(dst + 4 * dst_stride)); - d1 = vcombine_u16(vld1_u16(dst + 1 * 
dst_stride), - vld1_u16(dst + 5 * dst_stride)); - d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride), - vld1_u16(dst + 6 * dst_stride)); - d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride), - vld1_u16(dst + 7 * dst_stride)); - d0 = vrhaddq_u16(d0, t0); - d1 = vrhaddq_u16(d1, t1); - d2 = vrhaddq_u16(d2, t2); - d3 = vrhaddq_u16(d3, t3); - - vst1_u16(dst, vget_low_u16(d0)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d1)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d2)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d3)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d0)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d1)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d2)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d3)); - dst += dst_stride; - h -= 8; - } while (h > 0); - } else { - int width; - const uint16_t *s; - uint16_t *d; - int16x8_t s11, s12, s13, s14; - uint16x8_t d4, d5, d6, d7; - - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, - &s5, &s6, &s7); - transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - width = w; - s = src + 7; - d = dst; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); - - do { - load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11, - &s12, &s13, &s14); - transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, 
&s14); - - d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, - max); - d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, - max); - d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, - max); - d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, - max); - d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, - max); - d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, - max); - d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, - max); - d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, - filters, max); - - transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - - d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); - d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); - d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); - d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); - d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride)); - d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride)); - d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride)); - d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride)); - - store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); - - s0 = s8; - s1 = s9; - s2 = s10; - s3 = s11; - s4 = s12; - s5 = s13; - s6 = s14; - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 8 * src_stride; - dst += 8 * dst_stride; - h -= 8; - } while (h > 0); - } - } + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); } } @@ -589,160 +538,25 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, if (y_step_q4 != 16) { vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); - } else { - const int16x8_t filters = vld1q_s16(filter[y0_q4]); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - - src -= 3 
* src_stride; - - if (w == 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int32x4_t d0, d1, d2, d3; - uint16x8_t d01, d23; - - s0 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s1 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s2 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s3 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s4 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s5 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s6 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; + return; + } - do { - s7 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s8 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s9 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s10 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); - d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); - d01 = vminq_u16(d01, max); - d23 = vminq_u16(d23, max); - vst1_u16(dst, vget_low_u16(d01)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d01)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d23)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d23)); - dst += dst_stride; + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + 
assert(y_step_q4 == 16); - s0 = s4; - s1 = s5; - s2 = s6; - s3 = s7; - s4 = s8; - s5 = s9; - s6 = s10; - h -= 4; - } while (h > 0); - } else { - int height; - const uint16_t *s; - uint16_t *d; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint16x8_t d0, d1, d2, d3; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - s = src; - s0 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s1 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s2 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s3 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s4 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s5 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s6 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - d = dst; - height = h; - - do { - s7 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s8 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s9 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s10 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - - __builtin_prefetch(d + 0 * dst_stride); - __builtin_prefetch(d + 1 * dst_stride); - __builtin_prefetch(d + 2 * dst_stride); - __builtin_prefetch(d + 3 * dst_stride); - __builtin_prefetch(s + 0 * src_stride); - __builtin_prefetch(s + 1 * src_stride); - __builtin_prefetch(s + 2 * src_stride); - __builtin_prefetch(s + 3 * src_stride); - d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - d3 = - highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, 
s10, filters, max); - - vst1q_u16(d, d0); - d += dst_stride; - vst1q_u16(d, d1); - d += dst_stride; - vst1q_u16(d, d2); - d += dst_stride; - vst1q_u16(d, d3); - d += dst_stride; - - s0 = s4; - s1 = s5; - s2 = s6; - s3 = s7; - s4 = s8; - s5 = s9; - s6 = s10; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + const int16x4_t y_filter_4tap = vld1_s16(filter[y0_q4] + 2); + highbd_convolve_4tap_vert_neon(src - src_stride, src_stride, dst, + dst_stride, w, h, y_filter_4tap, bd); + } else { + const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]); + highbd_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter_8tap, bd); } } @@ -756,78 +570,89 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + + src -= 3 * src_stride; + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = + highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + uint16x4_t d1 = + highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + uint16x4_t d2 = + highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + uint16x4_t d3 = + highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride)); + d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride)); + d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride)); + d3 
= vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); } else { - const int16x8_t filters = vld1q_s16(filter[y0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - - src -= 3 * src_stride; - - if (w == 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int32x4_t d0, d1, d2, d3; - uint16x8_t d01, d23, t01, t23; - - s0 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s1 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s2 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s3 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s4 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s5 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s6 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + + s += 7 * src_stride; do { - s7 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s8 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s9 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s10 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = highbd_convolve8_4(s1, s2, s3, 
s4, s5, s6, s7, s8, filters); - d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); - t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); - t01 = vminq_u16(t01, max); - t23 = vminq_u16(t23, max); - - d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), - vld1_u16(dst + 1 * dst_stride)); - d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride), - vld1_u16(dst + 3 * dst_stride)); - d01 = vrhaddq_u16(d01, t01); - d23 = vrhaddq_u16(d23, t23); - - vst1_u16(dst, vget_low_u16(d01)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d01)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d23)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d23)); - dst += dst_stride; + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = + highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + uint16x8_t d1 = + highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + uint16x8_t d2 = + highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + uint16x8_t d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -836,96 +661,592 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, s4 = s8; s5 = s9; s6 = s10; - h -= 4; - } while (h > 0); - } else { - int height; - const uint16_t *s; - uint16_t *d; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + 
} +} - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - s = src; - s0 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s1 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s2 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s3 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s4 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s5 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s6 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - d = dst; - height = h; - - do { - s7 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s8 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s9 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s10 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - - __builtin_prefetch(d + 0 * dst_stride); - __builtin_prefetch(d + 1 * dst_stride); - __builtin_prefetch(d + 2 * dst_stride); - __builtin_prefetch(d + 3 * dst_stride); - __builtin_prefetch(s + 0 * src_stride); - __builtin_prefetch(s + 1 * src_stride); - __builtin_prefetch(s + 2 * src_stride); - __builtin_prefetch(s + 3 * src_stride); - t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - t3 = - highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - - d0 = vld1q_u16(d + 0 * dst_stride); - d1 = vld1q_u16(d + 1 * dst_stride); - d2 = vld1q_u16(d + 2 * dst_stride); - d3 = vld1q_u16(d + 3 * dst_stride); - d0 = vrhaddq_u16(d0, t0); - d1 = vrhaddq_u16(d1, t1); - d2 = vrhaddq_u16(d2, t2); - d3 = vrhaddq_u16(d3, t3); - - vst1q_u16(d, d0); - d += dst_stride; 
- vst1q_u16(d, d1); - d += dst_stride; - vst1q_u16(d, d2); - d += dst_stride; - vst1q_u16(d, d3); - d += dst_stride; - - s0 = s4; - s1 = s5; - s2 = s6; - s3 = s7; - s4 = s8; - s5 = s9; - s6 = s10; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } +static INLINE void highbd_convolve_2d_4tap_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, + const int16x4_t y_filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t h_s0[4], h_s1[4], h_s2[4]; + load_s16_4x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]); + + int16x4_t v_s0 = vreinterpret_s16_u16( + highbd_convolve4_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max)); + int16x4_t v_s1 = vreinterpret_s16_u16( + highbd_convolve4_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max)); + int16x4_t v_s2 = vreinterpret_s16_u16( + highbd_convolve4_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max)); + + s += 3 * src_stride; + + do { + int16x4_t h_s3[4], h_s4[4], h_s5[4], h_s6[4]; + load_s16_4x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], + &h_s3[3]); + load_s16_4x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], + &h_s4[3]); + load_s16_4x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], + &h_s5[3]); + load_s16_4x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], + &h_s6[3]); + + int16x4_t v_s3 = vreinterpret_s16_u16(highbd_convolve4_4( + h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max)); + int16x4_t v_s4 = vreinterpret_s16_u16(highbd_convolve4_4( + h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max)); + int16x4_t v_s5 = vreinterpret_s16_u16(highbd_convolve4_4( + h_s5[0], h_s5[1], h_s5[2], h_s5[3], 
x_filter, max)); + int16x4_t v_s6 = vreinterpret_s16_u16(highbd_convolve4_4( + h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max)); + + uint16x4_t d0 = highbd_convolve4_4(v_s0, v_s1, v_s2, v_s3, y_filter, max); + uint16x4_t d1 = highbd_convolve4_4(v_s1, v_s2, v_s3, v_s4, y_filter, max); + uint16x4_t d2 = highbd_convolve4_4(v_s2, v_s3, v_s4, v_s5, y_filter, max); + uint16x4_t d3 = highbd_convolve4_4(v_s3, v_s4, v_s5, v_s6, y_filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + + return; + } + + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t h_s0[4], h_s1[4], h_s2[4]; + load_s16_8x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]); + + int16x8_t v_s0 = vreinterpretq_s16_u16( + highbd_convolve4_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max)); + int16x8_t v_s1 = vreinterpretq_s16_u16( + highbd_convolve4_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max)); + int16x8_t v_s2 = vreinterpretq_s16_u16( + highbd_convolve4_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max)); + + s += 3 * src_stride; + + do { + int16x8_t h_s3[4], h_s4[4], h_s5[4], h_s6[4]; + load_s16_8x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], + &h_s3[3]); + load_s16_8x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], + &h_s4[3]); + load_s16_8x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], + &h_s5[3]); + load_s16_8x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], + &h_s6[3]); + + int16x8_t v_s3 = vreinterpretq_s16_u16(highbd_convolve4_8( + h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max)); + int16x8_t v_s4 = vreinterpretq_s16_u16(highbd_convolve4_8( + 
h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max)); + int16x8_t v_s5 = vreinterpretq_s16_u16(highbd_convolve4_8( + h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max)); + int16x8_t v_s6 = vreinterpretq_s16_u16(highbd_convolve4_8( + h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max)); + + uint16x8_t d0 = highbd_convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter, max); + uint16x8_t d1 = highbd_convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter, max); + uint16x8_t d2 = highbd_convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter, max); + uint16x8_t d3 = highbd_convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +static INLINE void highbd_convolve_2d_8tap_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x8_t x_filter, + const int16x8_t y_filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8]; + load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3], + &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]); + load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3], + &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]); + load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3], + &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]); + load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3], + &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]); + load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3], + &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]); + load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3], + &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]); + 
load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3], + &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]); + + int16x4_t v_s0 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5], + h_s0[6], h_s0[7], x_filter, max)); + int16x4_t v_s1 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5], + h_s1[6], h_s1[7], x_filter, max)); + int16x4_t v_s2 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5], + h_s2[6], h_s2[7], x_filter, max)); + int16x4_t v_s3 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5], + h_s3[6], h_s3[7], x_filter, max)); + int16x4_t v_s4 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5], + h_s4[6], h_s4[7], x_filter, max)); + int16x4_t v_s5 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5], + h_s5[6], h_s5[7], x_filter, max)); + int16x4_t v_s6 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5], + h_s6[6], h_s6[7], x_filter, max)); + + s += 7 * src_stride; + + do { + int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8]; + load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2], + &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]); + load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2], + &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]); + load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2], + &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]); + load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2], + &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]); + + int16x4_t v_s7 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4], + h_s7[5], h_s7[6], h_s7[7], x_filter, max)); + int16x4_t v_s8 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s8[0], 
h_s8[1], h_s8[2], h_s8[3], h_s8[4], + h_s8[5], h_s8[6], h_s8[7], x_filter, max)); + int16x4_t v_s9 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4], + h_s9[5], h_s9[6], h_s9[7], x_filter, max)); + int16x4_t v_s10 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4], + h_s10[5], h_s10[6], h_s10[7], x_filter, max)); + + uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, + v_s6, v_s7, y_filter, max); + uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + v_s7, v_s8, y_filter, max); + uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, + v_s8, v_s9, y_filter, max); + uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + v_s9, v_s10, y_filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + v_s5 = v_s9; + v_s6 = v_s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + + return; + } + + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8]; + load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3], + &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3], + &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3], + &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3], + &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]); + load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3], + &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]); + load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3], + &h_s5[4], &h_s5[5], &h_s5[6], 
&h_s5[7]); + load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3], + &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]); + + int16x8_t v_s0 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5], + h_s0[6], h_s0[7], x_filter, max)); + int16x8_t v_s1 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5], + h_s1[6], h_s1[7], x_filter, max)); + int16x8_t v_s2 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5], + h_s2[6], h_s2[7], x_filter, max)); + int16x8_t v_s3 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5], + h_s3[6], h_s3[7], x_filter, max)); + int16x8_t v_s4 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5], + h_s4[6], h_s4[7], x_filter, max)); + int16x8_t v_s5 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5], + h_s5[6], h_s5[7], x_filter, max)); + int16x8_t v_s6 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5], + h_s6[6], h_s6[7], x_filter, max)); + + s += 7 * src_stride; + + do { + int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8]; + load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2], + &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]); + load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2], + &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]); + load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2], + &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]); + load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2], + &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]); + + int16x8_t v_s7 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4], + h_s7[5], h_s7[6], h_s7[7], x_filter, max)); + int16x8_t v_s8 = vreinterpretq_s16_u16( + 
highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4], + h_s8[5], h_s8[6], h_s8[7], x_filter, max)); + int16x8_t v_s9 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4], + h_s9[5], h_s9[6], h_s9[7], x_filter, max)); + int16x8_t v_s10 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4], + h_s10[5], h_s10[6], h_s10[7], x_filter, max)); + + uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, + v_s6, v_s7, y_filter, max); + uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + v_s7, v_s8, y_filter, max); + uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, + v_s8, v_s9, y_filter, max); + uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + v_s9, v_s10, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + v_s5 = v_s9; + v_s6 = v_s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + if (x_step_q4 != 16 || y_step_q4 != 16) { + vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8; + const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 + // lines post both horizontally and vertically. 
+ const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1; + const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride; + + if (x_filter_taps == 4 && y_filter_taps == 4) { + const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2); + const int16x4_t y_filter = vld1_s16(filter[y0_q4] + 2); + + highbd_convolve_2d_4tap_neon(src - horiz_offset - vert_offset, src_stride, + dst, dst_stride, w, h, x_filter, y_filter, bd); + return; + } + + const int16x8_t x_filter = vld1q_s16(filter[x0_q4]); + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + highbd_convolve_2d_8tap_neon(src - horiz_offset - vert_offset, src_stride, + dst, dst_stride, w, h, x_filter, y_filter, bd); +} + +void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (x_step_q4 != 16 || y_step_q4 != 16) { + vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; } + + // Averaging convolution always uses an 8-tap filter. + const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1; + const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride; + // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2 + // lines post both horizontally and vertically. 
+ src = src - horiz_offset - vert_offset; + + const int16x8_t x_filter = vld1q_s16(filter[x0_q4]); + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8]; + load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3], + &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]); + load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3], + &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]); + load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3], + &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]); + load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3], + &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]); + load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3], + &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]); + load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3], + &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]); + load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3], + &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]); + + int16x4_t v_s0 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5], + h_s0[6], h_s0[7], x_filter, max)); + int16x4_t v_s1 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5], + h_s1[6], h_s1[7], x_filter, max)); + int16x4_t v_s2 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5], + h_s2[6], h_s2[7], x_filter, max)); + int16x4_t v_s3 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5], + h_s3[6], h_s3[7], x_filter, max)); + int16x4_t v_s4 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5], + h_s4[6], h_s4[7], x_filter, max)); + int16x4_t v_s5 = 
vreinterpret_s16_u16( + highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5], + h_s5[6], h_s5[7], x_filter, max)); + int16x4_t v_s6 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5], + h_s6[6], h_s6[7], x_filter, max)); + + s += 7 * src_stride; + + do { + int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8]; + load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2], + &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]); + load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2], + &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]); + load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2], + &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]); + load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2], + &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]); + + int16x4_t v_s7 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4], + h_s7[5], h_s7[6], h_s7[7], x_filter, max)); + int16x4_t v_s8 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4], + h_s8[5], h_s8[6], h_s8[7], x_filter, max)); + int16x4_t v_s9 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4], + h_s9[5], h_s9[6], h_s9[7], x_filter, max)); + int16x4_t v_s10 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4], + h_s10[5], h_s10[6], h_s10[7], x_filter, max)); + + uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, + v_s6, v_s7, y_filter, max); + uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + v_s7, v_s8, y_filter, max); + uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, + v_s8, v_s9, y_filter, max); + uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + v_s9, v_s10, y_filter, max); + + d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride)); + d1 = vrhadd_u16(d1, vld1_u16(d + 1 * 
dst_stride)); + d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride)); + d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + v_s5 = v_s9; + v_s6 = v_s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + + return; + } + + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8]; + load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3], + &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3], + &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3], + &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3], + &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]); + load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3], + &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]); + load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3], + &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]); + load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3], + &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]); + + int16x8_t v_s0 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5], + h_s0[6], h_s0[7], x_filter, max)); + int16x8_t v_s1 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5], + h_s1[6], h_s1[7], x_filter, max)); + int16x8_t v_s2 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5], + h_s2[6], h_s2[7], x_filter, max)); + int16x8_t v_s3 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], 
h_s3[3], h_s3[4], h_s3[5], + h_s3[6], h_s3[7], x_filter, max)); + int16x8_t v_s4 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5], + h_s4[6], h_s4[7], x_filter, max)); + int16x8_t v_s5 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5], + h_s5[6], h_s5[7], x_filter, max)); + int16x8_t v_s6 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5], + h_s6[6], h_s6[7], x_filter, max)); + + s += 7 * src_stride; + + do { + int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8]; + load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2], + &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]); + load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2], + &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]); + load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2], + &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]); + load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2], + &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]); + + int16x8_t v_s7 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4], + h_s7[5], h_s7[6], h_s7[7], x_filter, max)); + int16x8_t v_s8 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4], + h_s8[5], h_s8[6], h_s8[7], x_filter, max)); + int16x8_t v_s9 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4], + h_s9[5], h_s9[6], h_s9[7], x_filter, max)); + int16x8_t v_s10 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4], + h_s10[5], h_s10[6], h_s10[7], x_filter, max)); + + uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, + v_s6, v_s7, y_filter, max); + uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + v_s7, v_s8, y_filter, max); + uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, 
v_s6, v_s7, + v_s8, v_s9, y_filter, max); + uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + v_s9, v_s10, y_filter, max); + + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + v_s5 = v_s9; + v_s6 = v_s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c new file mode 100644 index 0000000000..7fc0a57c90 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" + +DECLARE_ALIGNED(16, static const uint16_t, kTblConv4_8[8]) = { 0, 2, 4, 6, + 1, 3, 5, 7 }; + +static INLINE uint16x4_t highbd_convolve4_4(const int16x4_t s[4], + const int16x8_t filter, + const uint16x4_t max) { + int16x8_t s01 = vcombine_s16(s[0], s[1]); + int16x8_t s23 = vcombine_s16(s[2], s[3]); + + int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s01, filter, 0); + int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s23, filter, 0); + + int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + uint16x4_t res_u16 = vqrshrun_n_s32(res_s32, FILTER_BITS); + return vmin_u16(res_u16, max); +} + +static INLINE uint16x8_t highbd_convolve4_8(const int16x8_t s[4], + const int16x8_t filter, + const uint16x8_t max, + uint16x8_t idx) { + int64x2_t sum04 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[0], filter, 0); + int64x2_t sum15 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[1], filter, 0); + int64x2_t sum26 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[2], filter, 0); + int64x2_t sum37 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[3], filter, 0); + + int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS), + vqrshrun_n_s32(res1, FILTER_BITS)); + + res = vpx_tbl_u16(res, idx); + + return vminq_u16(res, max); +} + +static INLINE uint16x4_t highbd_convolve8_4(const int16x8_t s[4], + const int16x8_t filter, + const uint16x4_t max) { + int64x2_t sum[4]; + + sum[0] = vpx_dotq_s16(vdupq_n_s64(0), s[0], filter); + sum[1] = vpx_dotq_s16(vdupq_n_s64(0), s[1], filter); + sum[2] = vpx_dotq_s16(vdupq_n_s64(0), s[2], filter); + sum[3] = vpx_dotq_s16(vdupq_n_s64(0), s[3], filter); + + sum[0] 
= vpaddq_s64(sum[0], sum[1]); + sum[2] = vpaddq_s64(sum[2], sum[3]); + + int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); + + uint16x4_t res_u16 = vqrshrun_n_s32(res_s32, FILTER_BITS); + return vmin_u16(res_u16, max); +} + +static INLINE uint16x8_t highbd_convolve8_8(const int16x8_t s[8], + const int16x8_t filter, + const uint16x8_t max) { + int64x2_t sum[8]; + + sum[0] = vpx_dotq_s16(vdupq_n_s64(0), s[0], filter); + sum[1] = vpx_dotq_s16(vdupq_n_s64(0), s[1], filter); + sum[2] = vpx_dotq_s16(vdupq_n_s64(0), s[2], filter); + sum[3] = vpx_dotq_s16(vdupq_n_s64(0), s[3], filter); + sum[4] = vpx_dotq_s16(vdupq_n_s64(0), s[4], filter); + sum[5] = vpx_dotq_s16(vdupq_n_s64(0), s[5], filter); + sum[6] = vpx_dotq_s16(vdupq_n_s64(0), s[6], filter); + sum[7] = vpx_dotq_s16(vdupq_n_s64(0), s[7], filter); + + int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]); + int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]); + int64x2_t sum45 = vpaddq_s64(sum[4], sum[5]); + int64x2_t sum67 = vpaddq_s64(sum[6], sum[7]); + + int32x4_t res0 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t res1 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS), + vqrshrun_n_s32(res1, FILTER_BITS)); + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve_4tap_horiz_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t filters, int bd) { + const int16x8_t filter = vcombine_s16(filters, vdup_n_s16(0)); + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], 
&s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve4_4(s0, filter, max); + uint16x4_t d1 = highbd_convolve4_4(s1, filter, max); + uint16x4_t d2 = highbd_convolve4_4(s2, filter, max); + uint16x4_t d3 = highbd_convolve4_4(s3, filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const uint16x8_t idx = vld1q_u16(kTblConv4_8); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = highbd_convolve4_8(s0, filter, max, idx); + uint16x8_t d1 = highbd_convolve4_8(s1, filter, max, idx); + uint16x8_t d2 = highbd_convolve4_8(s2, filter, max, idx); + uint16x8_t d3 = highbd_convolve4_8(s3, filter, max, idx); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void highbd_convolve_8tap_horiz_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x8_t filters, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], 
&s3[3]); + + uint16x4_t d0 = highbd_convolve8_4(s0, filters, max); + uint16x4_t d1 = highbd_convolve8_4(s1, filters, max); + uint16x4_t d2 = highbd_convolve8_4(s2, filters, max); + uint16x4_t d3 = highbd_convolve8_4(s3, filters, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8(s0, filters, max); + uint16x8_t d1 = highbd_convolve8_8(s1, filters, max); + uint16x8_t d2 = highbd_convolve8_8(s2, filters, max); + uint16x8_t d3 = highbd_convolve8_8(s3, filters, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +void vpx_highbd_convolve8_horiz_sve(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (x_step_q4 != 16) { + vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 
4) { + const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2); + highbd_convolve_4tap_horiz_sve(src - 1, src_stride, dst, dst_stride, w, h, + x_filter_4tap, bd); + } else { + const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]); + highbd_convolve_8tap_horiz_sve(src - 3, src_stride, dst, dst_stride, w, h, + x_filter_8tap, bd); + } +} + +void vpx_highbd_convolve8_avg_horiz_sve(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (x_step_q4 != 16) { + vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + return; + } + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + + src -= 3; + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve8_4(s0, filters, max); + uint16x4_t d1 = highbd_convolve8_4(s1, filters, max); + uint16x4_t d2 = highbd_convolve8_4(s2, filters, max); + uint16x4_t d3 = highbd_convolve8_4(s3, filters, max); + + d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride)); + d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride)); + d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride)); + d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const 
int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8(s0, filters, max); + uint16x8_t d1 = highbd_convolve8_8(s1, filters, max); + uint16x8_t d2 = highbd_convolve8_8(s2, filters, max); + uint16x8_t d3 = highbd_convolve8_8(s3, filters, max); + + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c new file mode 100644 index 0000000000..4ed7718f7d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c @@ -0,0 +1,452 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" +#include "vpx_dsp/arm/vpx_neon_sve2_bridge.h" + +// clang-format off +DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = { + // Shift left and insert new last column in transposed 4x4 block. + 1, 2, 3, 0, 5, 6, 7, 4, + // Shift left and insert two new columns in transposed 4x4 block. + 2, 3, 0, 1, 6, 7, 4, 5, + // Shift left and insert three new columns in transposed 4x4 block. + 3, 0, 1, 2, 7, 4, 5, 6, +}; +// clang-format on + +static INLINE void transpose_concat_4x4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + int16x8_t res[2]) { + // Transpose 16-bit elements: + // s0: 00, 01, 02, 03 + // s1: 10, 11, 12, 13 + // s2: 20, 21, 22, 23 + // s3: 30, 31, 32, 33 + // + // res[0]: 00 10 20 30 01 11 21 31 + // res[1]: 02 12 22 32 03 13 23 33 + + int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0)); + int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0)); + int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0)); + int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0)); + + int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q)); + int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q)); + + int32x4x2_t t0123 = vzipq_s32(s01, s23); + + res[0] = vreinterpretq_s16_s32(t0123.val[0]); + res[1] = vreinterpretq_s16_s32(t0123.val[1]); +} + +static INLINE void transpose_concat_8x4(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + int16x8_t res[4]) { + // Transpose 16-bit elements: + // s0: 00, 01, 02, 03, 04, 05, 06, 07 + // s1: 10, 11, 12, 13, 14, 15, 16, 17 + // s2: 20, 21, 22, 23, 24, 25, 26, 27 + // s3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // res[0]: 00 10 20 30 01 11 21 31 + // res[1]: 02 12 22 32 03 13 23 33 + // res[2]: 04 14 24 34 05 15 
25 35 + // res[3]: 06 16 26 36 07 17 27 37 + + int16x8x2_t s01 = vzipq_s16(s0, s1); + int16x8x2_t s23 = vzipq_s16(s2, s3); + + int32x4x2_t t0123_lo = vzipq_s32(vreinterpretq_s32_s16(s01.val[0]), + vreinterpretq_s32_s16(s23.val[0])); + int32x4x2_t t0123_hi = vzipq_s32(vreinterpretq_s32_s16(s01.val[1]), + vreinterpretq_s32_s16(s23.val[1])); + + res[0] = vreinterpretq_s16_s32(t0123_lo.val[0]); + res[1] = vreinterpretq_s16_s32(t0123_lo.val[1]); + res[2] = vreinterpretq_s16_s32(t0123_hi.val[0]); + res[3] = vreinterpretq_s16_s32(t0123_hi.val[1]); +} + +static INLINE void vpx_tbl2x4_s16(int16x8_t s0[4], int16x8_t s1[4], + int16x8_t res[4], uint16x8_t idx) { + res[0] = vpx_tbl2_s16(s0[0], s1[0], idx); + res[1] = vpx_tbl2_s16(s0[1], s1[1], idx); + res[2] = vpx_tbl2_s16(s0[2], s1[2], idx); + res[3] = vpx_tbl2_s16(s0[3], s1[3], idx); +} + +static INLINE void vpx_tbl2x2_s16(int16x8_t s0[2], int16x8_t s1[2], + int16x8_t res[2], uint16x8_t idx) { + res[0] = vpx_tbl2_s16(s0[0], s1[0], idx); + res[1] = vpx_tbl2_s16(s0[1], s1[1], idx); +} + +static INLINE uint16x4_t highbd_convolve8_4_v(int16x8_t s_lo[2], + int16x8_t s_hi[2], + int16x8_t filter, + uint16x4_t max) { + int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0); + sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1); + + int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0); + sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve8_8_v(const int16x8_t s_lo[4], + const int16x8_t s_hi[4], + const int16x8_t filter, + const uint16x8_t max) { + int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0); + sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1); + + int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0); + sum23 = vpx_dotq_lane_s16(sum23, 
s_hi[1], filter, 1); + + int64x2_t sum45 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[2], filter, 0); + sum45 = vpx_dotq_lane_s16(sum45, s_hi[2], filter, 1); + + int64x2_t sum67 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[3], filter, 0); + sum67 = vpx_dotq_lane_s16(sum67, s_hi[3], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), + vqrshrun_n_s32(sum4567, FILTER_BITS)); + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve8_8tap_vert_sve2( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) { + assert(w >= 4 && h >= 4); + uint16x8x3_t merge_tbl_idx = vld1q_u16_x3(kDotProdMergeBlockTbl); + + // Correct indices by the size of vector length. + merge_tbl_idx.val[0] = vaddq_u16( + merge_tbl_idx.val[0], + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL))); + merge_tbl_idx.val[1] = vaddq_u16( + merge_tbl_idx.val[1], + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL))); + merge_tbl_idx.val[2] = vaddq_u16( + merge_tbl_idx.val[2], + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL))); + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + do { + int16x4_t s7, s8, s9, sA; + + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA); + + int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; + transpose_concat_4x4(s7, s8, s9,
sA, s789A); + + vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]); + vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]); + vpx_tbl2x2_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]); + + uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filter, max); + uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filter, max); + uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filter, max); + uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + do { + int16x8_t s7, s8, s9, sA; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA); + + int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; + transpose_concat_8x4(s7, s8, s9, sA, s789A); + + vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]); + vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]); + vpx_tbl2x4_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]); + + uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, filter, max); + uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, filter, max); + uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, filter, max); + uint16x8_t d3 = highbd_convolve8_8_v(s3456, s789A, filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2,
d3); + + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s3456[2] = s789A[2]; + s3456[3] = s789A[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +void vpx_highbd_convolve8_vert_sve2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (y_step_q4 != 16) { + vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + vpx_highbd_convolve8_vert_neon(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + } else { + const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]); + highbd_convolve8_8tap_vert_sve2(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter_8tap, bd); + } +} + +void vpx_highbd_convolve8_avg_vert_sve2(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (y_step_q4 != 16) { + vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + + src -= 3 * src_stride; + +
uint16x8x3_t merge_tbl_idx = vld1q_u16_x3(kDotProdMergeBlockTbl); + + // Correct indices by the size of vector length. + merge_tbl_idx.val[0] = vaddq_u16( + merge_tbl_idx.val[0], + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL))); + merge_tbl_idx.val[1] = vaddq_u16( + merge_tbl_idx.val[1], + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL))); + merge_tbl_idx.val[2] = vaddq_u16( + merge_tbl_idx.val[2], + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL))); + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + do { + int16x4_t s7, s8, s9, sA; + + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA); + + int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; + transpose_concat_4x4(s7, s8, s9, sA, s789A); + + vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]); + vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]); + vpx_tbl2x2_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]); + + uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filters, max); + uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filters, max); + uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filters, max); + uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filters, max); + + d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride)); + d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride)); + d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride)); + d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] =
s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + do { + int16x8_t s7, s8, s9, sA; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA); + + int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; + transpose_concat_8x4(s7, s8, s9, sA, s789A); + + vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]); + vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]); + vpx_tbl2x4_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]); + + uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, filters, max); + uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, filters, max); + uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, filters, max); + uint16x8_t d3 = highbd_convolve8_8_v(s3456, s789A, filters, max); + + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s3456[2] = s789A[2]; +
s3456[3] = s789A[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c deleted file mode 100644 index 414ade3530..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2016 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_dsp/vpx_filter.h" -#include "vpx_ports/mem.h" - -void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, - uint16_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h, int bd) { - // + 1 to make it divisible by 4 - uint16_t temp[64 * 136]; - const int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - /* Filter starting 3 lines back. The neon implementation will ignore the given - * height and filter a multiple of 4 lines. Since this goes in to the temp - * buffer which has lots of extra room and is subsequently discarded this is - * safe if somewhat less than ideal.
*/ - vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, - filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, - intermediate_height, bd); - - /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); -} - -void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, - uint16_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h, int bd) { - // + 1 to make it divisible by 4 - uint16_t temp[64 * 136]; - const int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - /* This implementation has the same issues as above. In addition, we only want - * to average the values after both passes. - */ - vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, - filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, - intermediate_height, bd); - vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, - bd); -} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c index c54e588239..579096d78a 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c @@ -162,7 +162,7 @@ FUN_FLIP_SIGN(16, q_) // flip_sign_16 #define FUN_FLIP_SIGN_BACK(w, r) \ static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \ - const int8x##w##_t sign_bit = vdup##r##n_s8(0x80); \ + const int8x##w##_t sign_bit = vdup##r##n_s8((int8_t)0x80); \ return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit)); \ } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h index 38b0b6c1a9..268c4bd962 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h +++ 
b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h @@ -154,11 +154,10 @@ static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) { static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, ptrdiff_t stride) { uint32_t a; - uint32x2_t a_u32; - if (stride == 4) return vld1_u8(buf); + uint32x2_t a_u32 = vdup_n_u32(0); memcpy(&a, buf, 4); buf += stride; - a_u32 = vdup_n_u32(a); + a_u32 = vset_lane_u32(a, a_u32, 0); memcpy(&a, buf, 4); a_u32 = vset_lane_u32(a, a_u32, 1); return vreinterpret_u8_u32(a_u32); @@ -177,11 +176,10 @@ static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) { static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf, ptrdiff_t stride) { uint64_t a; - uint64x2_t a_u64; - if (stride == 4) return vld1q_u16(buf); + uint64x2_t a_u64 = vdupq_n_u64(0); memcpy(&a, buf, 8); buf += stride; - a_u64 = vdupq_n_u64(a); + a_u64 = vsetq_lane_u64(a, a_u64, 0); memcpy(&a, buf, 8); a_u64 = vsetq_lane_u64(a, a_u64, 1); return vreinterpretq_u16_u64(a_u64); @@ -191,10 +189,6 @@ static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf, static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { const uint32x2_t a_u32 = vreinterpret_u32_u8(a); - if (stride == 4) { - vst1_u8(buf, a); - return; - } uint32_to_mem(buf, vget_lane_u32(a_u32, 0)); buf += stride; uint32_to_mem(buf, vget_lane_u32(a_u32, 1)); @@ -204,11 +198,10 @@ static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, ptrdiff_t stride) { uint32_t a; - uint32x4_t a_u32; - if (stride == 4) return vld1q_u8(buf); + uint32x4_t a_u32 = vdupq_n_u32(0); memcpy(&a, buf, 4); buf += stride; - a_u32 = vdupq_n_u32(a); + a_u32 = vsetq_lane_u32(a, a_u32, 0); memcpy(&a, buf, 4); buf += stride; a_u32 = vsetq_lane_u32(a, a_u32, 1); @@ -225,10 +218,6 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride, const 
uint8x16_t a) { const uint32x4_t a_u32 = vreinterpretq_u32_u8(a); - if (stride == 4) { - vst1q_u8(buf, a); - return; - } uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0)); buf += stride; uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1)); @@ -449,6 +438,142 @@ static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p, vst1q_u8(s, s7); } +static INLINE void store_u16_4x3(uint16_t *s, const ptrdiff_t p, + const uint16x4_t s0, const uint16x4_t s1, + const uint16x4_t s2) { + vst1_u16(s, s0); + s += p; + vst1_u16(s, s1); + s += p; + vst1_u16(s, s2); +} + +static INLINE void load_s16_4x3(const int16_t *s, const ptrdiff_t p, + int16x4_t *s0, int16x4_t *s1, int16x4_t *s2) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); +} + +static INLINE void load_s16_4x4(const int16_t *s, const ptrdiff_t p, + int16x4_t *s0, int16x4_t *s1, int16x4_t *s2, + int16x4_t *s3) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); +} + +static INLINE void store_u16_4x4(uint16_t *s, const ptrdiff_t p, + const uint16x4_t s0, const uint16x4_t s1, + const uint16x4_t s2, const uint16x4_t s3) { + vst1_u16(s, s0); + s += p; + vst1_u16(s, s1); + s += p; + vst1_u16(s, s2); + s += p; + vst1_u16(s, s3); +} + +static INLINE void load_s16_4x7(const int16_t *s, const ptrdiff_t p, + int16x4_t *s0, int16x4_t *s1, int16x4_t *s2, + int16x4_t *s3, int16x4_t *s4, int16x4_t *s5, + int16x4_t *s6) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); + s += p; + *s4 = vld1_s16(s); + s += p; + *s5 = vld1_s16(s); + s += p; + *s6 = vld1_s16(s); +} + +static INLINE void load_s16_8x3(const int16_t *s, const ptrdiff_t p, + int16x8_t *s0, int16x8_t *s1, int16x8_t *s2) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); +} + +static INLINE void load_s16_8x4(const int16_t *s, const ptrdiff_t p, + int16x8_t *s0, int16x8_t *s1, int16x8_t 
*s2, + int16x8_t *s3) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); +} + +static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2, + uint16x8_t *s3) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); +} + +static INLINE void store_u16_8x4(uint16_t *s, const ptrdiff_t p, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2, const uint16x8_t s3) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); + s += p; + vst1q_u16(s, s3); +} + +static INLINE void store_u16_8x3(uint16_t *s, const ptrdiff_t p, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); +} + +static INLINE void load_s16_8x7(const int16_t *s, const ptrdiff_t p, + int16x8_t *s0, int16x8_t *s1, int16x8_t *s2, + int16x8_t *s3, int16x8_t *s4, int16x8_t *s5, + int16x8_t *s6) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); +} + static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5, @@ -470,4 +595,46 @@ static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, *s7 = vld1q_u16(s); } +static INLINE void load_s16_4x8(const int16_t *s, const ptrdiff_t p, + int16x4_t *s0, int16x4_t *s1, int16x4_t *s2, + int16x4_t *s3, int16x4_t *s4, int16x4_t *s5, + int16x4_t *s6, int16x4_t *s7) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); + s += p; + *s4 = vld1_s16(s); + s += p; + *s5 = vld1_s16(s); + s += p; + *s6 = vld1_s16(s); + s += p; + 
*s7 = vld1_s16(s); +} + +static INLINE void load_s16_8x8(const int16_t *s, const ptrdiff_t p, + int16x8_t *s0, int16x8_t *s1, int16x8_t *s2, + int16x8_t *s3, int16x8_t *s4, int16x8_t *s5, + int16x8_t *s6, int16x8_t *s7) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); + s += p; + *s7 = vld1q_s16(s); +} + #endif // VPX_VPX_DSP_ARM_MEM_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c new file mode 100644 index 0000000000..a18cbbd736 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" + +uint64_t vpx_sum_squares_2d_i16_sve(const int16_t *src, int stride, int size) { + if (size == 4) { + int16x4_t s[4]; + int64x2_t sum = vdupq_n_s64(0); + + s[0] = vld1_s16(src + 0 * stride); + s[1] = vld1_s16(src + 1 * stride); + s[2] = vld1_s16(src + 2 * stride); + s[3] = vld1_s16(src + 3 * stride); + + int16x8_t s01 = vcombine_s16(s[0], s[1]); + int16x8_t s23 = vcombine_s16(s[2], s[3]); + + sum = vpx_dotq_s16(sum, s01, s01); + sum = vpx_dotq_s16(sum, s23, s23); + + return horizontal_add_uint64x2(vreinterpretq_u64_s64(sum)); + } else { + int rows = size; + int64x2_t sum[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0) }; + + do { + const int16_t *src_ptr = src; + int cols = size; + + do { + int16x8_t s[8]; + load_s16_8x8(src_ptr, stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + + sum[0] = vpx_dotq_s16(sum[0], s[0], s[0]); + sum[1] = vpx_dotq_s16(sum[1], s[1], s[1]); + sum[2] = vpx_dotq_s16(sum[2], s[2], s[2]); + sum[3] = vpx_dotq_s16(sum[3], s[3], s[3]); + sum[0] = vpx_dotq_s16(sum[0], s[4], s[4]); + sum[1] = vpx_dotq_s16(sum[1], s[5], s[5]); + sum[2] = vpx_dotq_s16(sum[2], s[6], s[6]); + sum[3] = vpx_dotq_s16(sum[3], s[7], s[7]); + + src_ptr += 8; + cols -= 8; + } while (cols); + + src += 8 * stride; + rows -= 8; + } while (rows); + + sum[0] = vaddq_s64(sum[0], sum[1]); + sum[2] = vaddq_s64(sum[2], sum[3]); + sum[0] = vaddq_s64(sum[0], sum[2]); + + return horizontal_add_uint64x2(vreinterpretq_u64_s64(sum[0])); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h index 74f85a6bb6..c989a6721b 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h +++ b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h @@ -524,12 +524,20 @@ static INLINE void 
transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1, *a7 = vreinterpretq_s32_s64(c3.val[1]); } -// Note: Using 'd' registers or 'q' registers has almost identical speed. We use -// 'q' registers here to save some instructions. static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5, uint8x8_t *a6, uint8x8_t *a7) { - // Swap 8 bit elements. Goes from: + // Widen to 128-bit registers (usually a no-op once inlined.) + const uint8x16_t a0q = vcombine_u8(*a0, vdup_n_u8(0)); + const uint8x16_t a1q = vcombine_u8(*a1, vdup_n_u8(0)); + const uint8x16_t a2q = vcombine_u8(*a2, vdup_n_u8(0)); + const uint8x16_t a3q = vcombine_u8(*a3, vdup_n_u8(0)); + const uint8x16_t a4q = vcombine_u8(*a4, vdup_n_u8(0)); + const uint8x16_t a5q = vcombine_u8(*a5, vdup_n_u8(0)); + const uint8x16_t a6q = vcombine_u8(*a6, vdup_n_u8(0)); + const uint8x16_t a7q = vcombine_u8(*a7, vdup_n_u8(0)); + + // Zip 8 bit elements. Goes from: // a0: 00 01 02 03 04 05 06 07 // a1: 10 11 12 13 14 15 16 17 // a2: 20 21 22 23 24 25 26 27 @@ -539,43 +547,41 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, // a6: 60 61 62 63 64 65 66 67 // a7: 70 71 72 73 74 75 76 77 // to: - // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56 - // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57 - // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76 - // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77 - - const uint8x16x2_t b0 = - vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5)); - const uint8x16x2_t b1 = - vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7)); - - // Swap 16 bit elements resulting in: - // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74 - // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76 - // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75 - // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77 - - const uint16x8x2_t
c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), - vreinterpretq_u16_u8(b1.val[0])); - const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), - vreinterpretq_u16_u8(b1.val[1])); - - // Unzip 32 bit elements resulting in: + // b0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // b1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // b2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // b3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const uint8x16_t b0 = vzipq_u8(a0q, a1q).val[0]; + const uint8x16_t b1 = vzipq_u8(a2q, a3q).val[0]; + const uint8x16_t b2 = vzipq_u8(a4q, a5q).val[0]; + const uint8x16_t b3 = vzipq_u8(a6q, a7q).val[0]; + + // Zip 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // c0.val[1]: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // c1.val[0]: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // c1.val[1]: 44 54 64 74 45 55 65 75 46 66 56 76 47 67 57 77 + const uint16x8x2_t c0 = + vzipq_u16(vreinterpretq_u16_u8(b0), vreinterpretq_u16_u8(b1)); + const uint16x8x2_t c1 = + vzipq_u16(vreinterpretq_u16_u8(b2), vreinterpretq_u16_u8(b3)); + + // Zip 32 bit elements resulting in: // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // d0.val[1]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // d1.val[0]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 - const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]), + const uint32x4x2_t d0 = vzipq_u32(vreinterpretq_u32_u16(c0.val[0]), vreinterpretq_u32_u16(c1.val[0])); - const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), + const uint32x4x2_t d1 = vzipq_u32(vreinterpretq_u32_u16(c0.val[1]), vreinterpretq_u32_u16(c1.val[1])); *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0])); *a1 = 
vreinterpret_u8_u32(vget_high_u32(d0.val[0])); - *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); - *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); - *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); - *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); + *a2 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); + *a3 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); + *a4 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); + *a5 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1])); *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c index 65fb67c984..037ea1142d 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c @@ -20,44 +20,36 @@ #include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" -// Note: -// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src). -// 2. After refactoring the shared code in kernel loops with inline functions, -// the decoder speed dropped a lot when using gcc compiler. Therefore there is -// no refactoring for those parts by now. -// 3. For horizontal convolve, there is an alternative optimization that -// convolves a single row in each loop. For each row, 8 sample banks with 4 or 8 -// samples in each are read from memory: src, (src+1), (src+2), (src+3), -// (src+4), (src+5), (src+6), (src+7), or prepared by vector extract -// instructions. This optimization is much faster in speed unit test, but slowed -// down the whole decoder by 5%. 
- -static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, int w, - int h, const int16x4_t filter) { +static INLINE void convolve_4tap_horiz_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, + const int16x8_t filter) { + // 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const uint8x8_t x_filter = + vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1); + + // Neon does not have lane-referencing multiply or multiply-accumulate + // instructions that operate on vectors of 8-bit elements. This means we have + // to duplicate filter taps into a whole vector and use standard multiply / + // multiply-accumulate instructions. + const uint8x8_t filter_taps[4] = { vdup_lane_u8(x_filter, 2), + vdup_lane_u8(x_filter, 3), + vdup_lane_u8(x_filter, 4), + vdup_lane_u8(x_filter, 5) }; + if (w == 4) { do { - int16x4_t s0[4], s1[4]; - - int16x8_t t0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src))); - s0[0] = vget_low_s16(vextq_s16(t0, t0, 0)); - s0[1] = vget_low_s16(vextq_s16(t0, t0, 1)); - s0[2] = vget_low_s16(vextq_s16(t0, t0, 2)); - s0[3] = vget_low_s16(vextq_s16(t0, t0, 3)); + uint8x8_t s01[4]; - int16x8_t t1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + src_stride))); - s1[0] = vget_low_s16(vextq_s16(t1, t1, 0)); - s1[1] = vget_low_s16(vextq_s16(t1, t1, 1)); - s1[2] = vget_low_s16(vextq_s16(t1, t1, 2)); - s1[3] = vget_low_s16(vextq_s16(t1, t1, 3)); + s01[0] = load_unaligned_u8(src + 0, src_stride); + s01[1] = load_unaligned_u8(src + 1, src_stride); + s01[2] = load_unaligned_u8(src + 2, src_stride); + s01[3] = load_unaligned_u8(src + 3, src_stride); - int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter); - int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter); - uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d01 = convolve4_8(s01[0], 
s01[1], s01[2], s01[3], filter_taps); - store_u8(dst, dst_stride, d01); + store_unaligned_u8(dst, dst_stride, d01); src += 2 * src_stride; dst += 2 * dst_stride; @@ -70,25 +62,20 @@ static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src, int width = w; do { - int16x8_t t0[2], t1[2]; - int16x8_t s0[4], s1[4]; - - t0[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - t0[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 8))); - s0[0] = vextq_s16(t0[0], t0[1], 0); - s0[1] = vextq_s16(t0[0], t0[1], 1); - s0[2] = vextq_s16(t0[0], t0[1], 2); - s0[3] = vextq_s16(t0[0], t0[1], 3); - - t1[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride))); - t1[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride + 8))); - s1[0] = vextq_s16(t1[0], t1[1], 0); - s1[1] = vextq_s16(t1[0], t1[1], 1); - s1[2] = vextq_s16(t1[0], t1[1], 2); - s1[3] = vextq_s16(t1[0], t1[1], 3); - - uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter); - uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter); + uint8x8_t s0[4], s1[4]; + + s0[0] = vld1_u8(s + 0); + s0[1] = vld1_u8(s + 1); + s0[2] = vld1_u8(s + 2); + s0[3] = vld1_u8(s + 3); + + s1[0] = vld1_u8(s + src_stride + 0); + s1[1] = vld1_u8(s + src_stride + 1); + s1[2] = vld1_u8(s + src_stride + 2); + s1[3] = vld1_u8(s + src_stride + 3); + + uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter_taps); + uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter_taps); vst1_u8(d, d0); vst1_u8(d + dst_stride, d1); @@ -103,47 +90,41 @@ static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src, } } -static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, int w, - int h, const int16x8_t filter) { - uint8x8_t t0, t1, t2, t3; - +static INLINE void convolve_8tap_horiz_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, + const int16x8_t filter) { if (h == 4) { - uint8x8_t d01, d23; 
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - + uint8x8_t t0, t1, t2, t3; load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); - s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + src += 7; do { - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - transpose_u8_8x4(&t0, &t1, &t2, &t3); - s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + uint8x8_t t7, t8, t9, t10; + 
load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + transpose_u8_8x4(&t7, &t8, &t9, &t10); + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); transpose_u8_4x4(&d01, &d23); @@ -162,52 +143,33 @@ static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src, w -= 4; } while (w != 0); } else { - int width; - const uint8_t *s; - uint8x8_t t4, t5, t6, t7, d04, d15, d26, d37; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - if (w == 4) { do { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); load_u8_8x8(src + 7, src_stride, 
&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - src += 8 * src_stride; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); transpose_u8_8x4(&d04, &d15, &d26, &d37); @@ -216,57 +178,53 @@ static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src, store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26); store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37); + src 
+= 8 * src_stride; dst += 8 * dst_stride; h -= 8; } while (h > 0); } else { - uint8_t *d; - uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; - int16x8_t s11, s12, s13, s14; - do { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - - width = w; - s = src + 7; - d = dst; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + const uint8_t *s = src + 7; + uint8_t *d = dst; + int width = w; do { - load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); - s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - - d0 = 
convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); + uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15; + load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14, + &t15); + + transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); + uint8x8_t d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); + uint8x8_t d6 = + convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); + uint8x8_t d7 = + convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); @@ -304,17 +262,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, (void)y0_q4; (void)y_step_q4; + const int16x8_t x_filter = vld1q_s16(filter[x0_q4]); + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { - /* 
All 4-tap and bilinear filter values are even, so halve them to reduce - * intermediate precision requirements. - */ - const int16x4_t x_filter_4tap = vshr_n_s16(vld1_s16(filter[x0_q4] + 2), 1); - vpx_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h, - x_filter_4tap); + convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h, + x_filter); } else { - const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]); - vpx_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h, - x_filter_8tap); + convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h, + x_filter); } } @@ -324,7 +279,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { const int16x8_t filters = vld1q_s16(filter[x0_q4]); - uint8x8_t t0, t1, t2, t3; assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -337,48 +291,41 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3; if (h == 4) { - uint8x8_t d01, d23, dd01, dd23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); + uint8x8_t t0, t1, t2, t3; load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); - s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - 
__builtin_prefetch(dst + 3 * dst_stride); + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + src += 7; do { - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - transpose_u8_8x4(&t0, &t1, &t2, &t3); - s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); - s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); - s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); - s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); - - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + transpose_u8_8x4(&t7, &t8, &t9, &t10); + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 
= vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); transpose_u8_4x4(&d01, &d23); - dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride); - dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride); + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride); + uint8x8_t dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride); d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); @@ -398,61 +345,40 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, w -= 4; } while (w != 0); } else { - int width; - const uint8_t *s; - uint8x8_t t4, t5, t6, t7; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - if (w == 4) { - uint8x8_t d04, d15, d26, d37, dd04, dd15, dd26, dd37; - do { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - src += 8 * src_stride; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - 
__builtin_prefetch(dst + 7 * dst_stride); + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); transpose_u8_8x4(&d04, &d15, &d26, &d37); - dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride); - dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride); - dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride); - dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride); + uint8x8_t dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride); + uint8x8_t dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride); + uint8x8_t dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride); + uint8x8_t dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride); d04 = vrhadd_u8(d04, dd04); d15 = vrhadd_u8(d15, dd15); @@ -464,65 +390,54 @@ void 
vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26); store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37); + src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; } while (h != 0); } else { - uint8_t *d; - uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; - int16x8_t s11, s12, s13, s14; - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - - width = w; - s = src + 7; - d = dst; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + const uint8_t *s = src + 7; + 
uint8_t *d = dst; + int width = w; do { - load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); - s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); + uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15; + load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14, + &t15); + + transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + uint8x8_t d4 = convolve8_8(s4, s5, s6, 
s7, s8, s9, s10, s11, filters); + uint8x8_t d5 = + convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + uint8x8_t d6 = + convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + uint8x8_t d7 = + convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); @@ -556,152 +471,37 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } -static INLINE void vpx_convolve_4tap_vert_neon(const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, int w, - int h, const int16x4_t filter) { - if (w == 4) { - uint8x8_t t0, t1, t2, t3, d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, d0, d1, d2, d3; - - load_u8_8x3(src, src_stride, &t0, &t1, &t2); - s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); - s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); - s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); - - src += 3 * src_stride; - - do { - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); - s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); - s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); - s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); - - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - - d0 = convolve4_4(s0, s1, s2, s3, filter); - d1 = convolve4_4(s1, s2, s3, s4, filter); - d2 = convolve4_4(s2, s3, s4, s5, filter); - d3 = convolve4_4(s3, s4, s5, s6, filter); - /* We halved the filter values so -1 from right shift. 
*/ - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - s0 = s4; - s1 = s5; - s2 = s6; - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - int height; - const uint8_t *s; - uint8_t *d; - uint8x8_t t0, t1, t2, t3, d0, d1, d2, d3; - int16x8_t s0, s1, s2, s3, s4, s5, s6; - - do { - load_u8_8x3(src, src_stride, &t0, &t1, &t2); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - - s = src + 3 * src_stride; - d = dst; - height = h; - - do { - load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s3 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t3)); - - __builtin_prefetch(d + 0 * dst_stride); - __builtin_prefetch(d + 1 * dst_stride); - __builtin_prefetch(d + 2 * dst_stride); - __builtin_prefetch(d + 3 * dst_stride); - __builtin_prefetch(s + 0 * src_stride); - __builtin_prefetch(s + 1 * src_stride); - __builtin_prefetch(s + 2 * src_stride); - __builtin_prefetch(s + 3 * src_stride); - - d0 = convolve4_8(s0, s1, s2, s3, filter); - d1 = convolve4_8(s1, s2, s3, s4, filter); - d2 = convolve4_8(s2, s3, s4, s5, filter); - d3 = convolve4_8(s3, s4, s5, s6, filter); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s0 = s4; - s1 = s5; - s2 = s6; - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src += 8; - dst += 8; - w -= 8; - } while (w != 0); - } -} - -static INLINE void vpx_convolve_8tap_vert_neon(const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, int w, - int h, const int16x8_t filter) { +static INLINE void convolve_8tap_vert_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + 
ptrdiff_t dst_stride, int w, int h, + const int16x8_t filter) { if (w == 4) { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); - s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); - s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); - s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); - s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); - s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); - s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); src += 7 * src_stride; do { - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); - s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); - s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); - s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); - - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_4(s2, s3, s4, s5, s6, 
s7, s8, s9, filter); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8))); + int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9))); + int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -718,54 +518,33 @@ static INLINE void vpx_convolve_8tap_vert_neon(const uint8_t *src, h -= 4; } while (h != 0); } else { - int height; - const uint8_t *s; - uint8_t *d; - uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - 
s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - - s = src + 7 * src_stride; - d = dst; - height = h; + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + const uint8_t *s = src + 7 * src_stride; + uint8_t *d = dst; + int height = h; do { - load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - - __builtin_prefetch(d + 0 * dst_stride); - __builtin_prefetch(d + 1 * dst_stride); - __builtin_prefetch(d + 2 * dst_stride); - __builtin_prefetch(d + 3 * dst_stride); - __builtin_prefetch(s + 0 * src_stride); - __builtin_prefetch(s + 1 * src_stride); - __builtin_prefetch(s + 2 * src_stride); - __builtin_prefetch(s + 3 * src_stride); - - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); 
store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -800,17 +579,14 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, (void)x_step_q4; (void)y_step_q4; + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { - /* All 4-tap and bilinear filter values are even, so halve them to reduce - * intermediate precision requirements. - */ - const int16x4_t y_filter_4tap = vshr_n_s16(vld1_s16(filter[y0_q4] + 2), 1); - vpx_convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, - w, h, y_filter_4tap); + convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h, + y_filter); } else { - const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]); - vpx_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, - dst_stride, w, h, y_filter_8tap); + convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, dst_stride, + w, h, y_filter); } } @@ -832,45 +608,35 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23, dd01, dd23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); - s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); - s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); - s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); - s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); - s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); - s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t 
s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); src += 7 * src_stride; do { - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); - s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); - s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); - s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); - - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8))); + int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9))); + int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + uint8x8_t d01 = 
vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride); d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); @@ -890,54 +656,33 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - int height; - const uint8_t *s; - uint8_t *d; - uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - - s = src + 7 * src_stride; - d = dst; - height = h; + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + const uint8_t *s = src + 7 * src_stride; + uint8_t *d = dst; + int height = h; do { - load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = 
vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - - __builtin_prefetch(d + 0 * dst_stride); - __builtin_prefetch(d + 1 * dst_stride); - __builtin_prefetch(d + 2 * dst_stride); - __builtin_prefetch(d + 3 * dst_stride); - __builtin_prefetch(s + 0 * src_stride); - __builtin_prefetch(s + 1 * src_stride); - __builtin_prefetch(s + 2 * src_stride); - __builtin_prefetch(s + 3 * src_stride); - - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride)); d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride)); diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h index 4ecaee0f99..10cc761ccd 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h @@ -17,360 +17,6 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_filter.h" -#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) - -void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, - ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, - int y_step_q4, int w, int h); - -static 
INLINE int16x4_t convolve4_4_sdot_partial(const int8x16_t samples, - const int32x4_t correction, - const int8x8_t filters) { - /* Accumulate dot product into 'correction' to account for range clamp. */ - int32x4_t sum = vdotq_lane_s32(correction, samples, filters, 0); - - /* Further narrowing and packing is performed by the caller. */ - return vmovn_s32(sum); -} - -static INLINE int16x4_t convolve4_4_sdot(const uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16_t permute_tbl) { - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - int8x16_t clamped_samples = - vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); - - /* Accumulate dot product into 'correction' to account for range clamp. */ - int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0); - - /* Further narrowing and packing is performed by the caller. */ - return vmovn_s32(sum); -} - -static INLINE uint8x8_t convolve4_8_sdot_partial(const int8x16_t samples_lo, - const int8x16_t samples_hi, - const int32x4_t correction, - const int8x8_t filters) { - /* Sample range-clamping and permutation are performed by the caller. */ - /* Accumulate dot product into 'correction' to account for range clamp. */ - /* First 4 output values. */ - int32x4_t sum0 = vdotq_lane_s32(correction, samples_lo, filters, 0); - /* Second 4 output values. */ - int32x4_t sum1 = vdotq_lane_s32(correction, samples_hi, filters, 0); - - /* Narrow and re-pack. */ - int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); - /* We halved the filter values so -1 from right shift. 
*/ - return vqrshrun_n_s16(sum, FILTER_BITS - 1); -} - -static INLINE uint8x8_t convolve4_8_sdot(const uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x2_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[2]; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); - - /* Accumulate dot product into 'correction' to account for range clamp. */ - /* First 4 output values. */ - int32x4_t sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); - /* Second 4 output values. */ - int32x4_t sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0); - - /* Narrow and re-pack. */ - int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); - /* We halved the filter values so -1 from right shift. */ - return vqrshrun_n_s16(sum, FILTER_BITS - 1); -} - -static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, - const int8x16_t samples_hi, - const int32x4_t correction, - const int8x8_t filters) { - /* Sample range-clamping and permutation are performed by the caller. */ - int32x4_t sum; - - /* Accumulate dot product into 'correction' to account for range clamp. */ - sum = vdotq_lane_s32(correction, samples_lo, filters, 0); - sum = vdotq_lane_s32(sum, samples_hi, filters, 1); - - /* Further narrowing and packing is performed by the caller. 
*/ - return vqmovn_s32(sum); -} - -static INLINE int16x4_t convolve8_4_sdot(const uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x2_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[2]; - int32x4_t sum; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); - - /* Accumulate dot product into 'correction' to account for range clamp. */ - sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); - sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1); - - /* Further narrowing and packing is performed by the caller. */ - return vqmovn_s32(sum); -} - -static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, - const int8x16_t samples0_hi, - const int8x16_t samples1_lo, - const int8x16_t samples1_hi, - const int32x4_t correction, - const int8x8_t filters) { - /* Sample range-clamping and permutation are performed by the caller. */ - int32x4_t sum0, sum1; - int16x8_t sum; - - /* Accumulate dot product into 'correction' to account for range clamp. */ - /* First 4 output values. */ - sum0 = vdotq_lane_s32(correction, samples0_lo, filters, 0); - sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1); - /* Second 4 output values. */ - sum1 = vdotq_lane_s32(correction, samples1_lo, filters, 0); - sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1); - - /* Narrow and re-pack. 
*/ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, FILTER_BITS); -} - -static INLINE uint8x8_t convolve8_8_sdot(const uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x3_t permute_tbl) { - int8x16_t clamped_samples, permuted_samples[3]; - int32x4_t sum0, sum1; - int16x8_t sum; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); - /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ - permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); - - /* Accumulate dot product into 'correction' to account for range clamp. */ - /* First 4 output values. */ - sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); - sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1); - /* Second 4 output values. */ - sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0); - sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1); - - /* Narrow and re-pack. 
*/ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, FILTER_BITS); -} - -#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) - -#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) - -void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h); - -static INLINE int16x4_t convolve4_4_usdot_partial(const uint8x16_t samples, - const int8x8_t filters) { - int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples, filters, 0); - - /* Further narrowing and packing is performed by the caller. */ - return vmovn_s32(sum); -} - -static INLINE int16x4_t convolve4_4_usdot(const uint8x16_t samples, - const int8x8_t filters, - const uint8x16_t permute_tbl) { - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); - - int32x4_t sum = - vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0); - - /* Further narrowing and packing is performed by the caller. */ - return vmovn_s32(sum); -} - -static INLINE uint8x8_t convolve4_8_usdot_partial(const uint8x16_t samples_lo, - const uint8x16_t samples_hi, - const int8x8_t filters) { - /* Sample permutation is performed by the caller. */ - /* First 4 output values. */ - int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); - /* Second 4 output values. */ - int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples_hi, filters, 0); - - /* Narrow and re-pack. */ - int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); - /* We halved the filter values so -1 from right shift. 
*/ - return vqrshrun_n_s16(sum, FILTER_BITS - 1); -} - -static INLINE uint8x8_t convolve4_8_usdot(const uint8x16_t samples, - const int8x8_t filters, - const uint8x16x2_t permute_tbl) { - uint8x16_t permuted_samples[2]; - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); - - /* First 4 output values. */ - int32x4_t sum0 = - vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); - /* Second 4 output values. */ - int32x4_t sum1 = - vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); - - /* Narrow and re-pack. */ - int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); - /* We halved the filter values so -1 from right shift. */ - return vqrshrun_n_s16(sum, FILTER_BITS - 1); -} - -static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, - const uint8x16_t samples_hi, - const int8x8_t filters) { - /* Sample permutation is performed by the caller. */ - int32x4_t sum; - - sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); - sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); - - /* Further narrowing and packing is performed by the caller. */ - return vqmovn_s32(sum); -} - -static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples, - const int8x8_t filters, - const uint8x16x2_t permute_tbl) { - uint8x16_t permuted_samples[2]; - int32x4_t sum; - - /* Permute samples ready for dot product. 
*/ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); - - sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); - sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); - - /* Further narrowing and packing is performed by the caller. */ - return vqmovn_s32(sum); -} - -static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, - const uint8x16_t samples0_hi, - const uint8x16_t samples1_lo, - const uint8x16_t samples1_hi, - const int8x8_t filters) { - /* Sample permutation is performed by the caller. */ - int32x4_t sum0, sum1; - int16x8_t sum; - - /* First 4 output values. */ - sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0); - sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1); - /* Second 4 output values. */ - sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0); - sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, FILTER_BITS); -} - -static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples, - const int8x8_t filters, - const uint8x16x3_t permute_tbl) { - uint8x16_t permuted_samples[3]; - int32x4_t sum0, sum1; - int16x8_t sum; - - /* Permute samples ready for dot product. */ - /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ - permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); - /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ - permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); - /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ - permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); - - /* First 4 output values. 
*/ - sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); - sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1); - /* Second 4 output values. */ - sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); - sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1); - - /* Narrow and re-pack. */ - sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, FILTER_BITS); -} - -#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) - -static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1, - const int16x4_t s2, const int16x4_t s3, - const int16x4_t filters) { - int16x4_t sum = vmul_lane_s16(s0, filters, 0); - sum = vmla_lane_s16(sum, s1, filters, 1); - sum = vmla_lane_s16(sum, s2, filters, 2); - sum = vmla_lane_s16(sum, s3, filters, 3); - return sum; -} - -static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1, - const int16x8_t s2, const int16x8_t s3, - const int16x4_t filters) { - int16x8_t sum = vmulq_lane_s16(s0, filters, 0); - sum = vmlaq_lane_s16(sum, s1, filters, 1); - sum = vmlaq_lane_s16(sum, s2, filters, 2); - sum = vmlaq_lane_s16(sum, s3, filters, 3); - /* We halved the filter values so -1 from right shift. */ - return vqrshrun_n_s16(sum, FILTER_BITS - 1); -} - static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, @@ -428,4 +74,99 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, filters); } +// 2-tap (bilinear) filter values are always positive, but 4-tap filter values +// are negative on the outer edges (taps 0 and 3), with taps 1 and 2 having much +// greater positive values to compensate. To use instructions that operate on +// 8-bit types we also need the types to be unsigned. Subtracting the products +// of taps 0 and 3 from the products of taps 1 and 2 always works given that +// 2-tap filters are 0-padded. 
+static INLINE uint8x8_t convolve4_8(const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t filter_taps[4]) { + uint16x8_t sum = vmull_u8(s1, filter_taps[1]); + sum = vmlal_u8(sum, s2, filter_taps[2]); + sum = vmlsl_u8(sum, s0, filter_taps[0]); + sum = vmlsl_u8(sum, s3, filter_taps[3]); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(vreinterpretq_s16_u16(sum), FILTER_BITS - 1); +} + +static INLINE void convolve_4tap_vert_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, + const int16x8_t filter) { + // 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const uint8x8_t y_filter = + vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1); + + // Neon does not have lane-referencing multiply or multiply-accumulate + // instructions that operate on vectors of 8-bit elements. This means we have + // to duplicate filter taps into a whole vector and use standard multiply / + // multiply-accumulate instructions. 
+ const uint8x8_t filter_taps[4] = { vdup_lane_u8(y_filter, 2), + vdup_lane_u8(y_filter, 3), + vdup_lane_u8(y_filter, 4), + vdup_lane_u8(y_filter, 5) }; + + if (w == 4) { + uint8x8_t s01 = load_unaligned_u8(src + 0 * src_stride, src_stride); + uint8x8_t s12 = load_unaligned_u8(src + 1 * src_stride, src_stride); + + src += 2 * src_stride; + + do { + uint8x8_t s23 = load_unaligned_u8(src + 0 * src_stride, src_stride); + uint8x8_t s34 = load_unaligned_u8(src + 1 * src_stride, src_stride); + uint8x8_t s45 = load_unaligned_u8(src + 2 * src_stride, src_stride); + uint8x8_t s56 = load_unaligned_u8(src + 3 * src_stride, src_stride); + + uint8x8_t d01 = convolve4_8(s01, s12, s23, s34, filter_taps); + uint8x8_t d23 = convolve4_8(s23, s34, s45, s56, filter_taps); + + store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01); + store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23); + + s01 = s45; + s12 = s56; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x8_t s0, s1, s2; + load_u8_8x3(s, src_stride, &s0, &s1, &s2); + + s += 3 * src_stride; + + do { + uint8x8_t s3, s4, s5, s6; + load_u8_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint8x8_t d0 = convolve4_8(s0, s1, s2, s3, filter_taps); + uint8x8_t d1 = convolve4_8(s1, s2, s3, s4, filter_taps); + uint8x8_t d2 = convolve4_8(s2, s3, s4, s5, filter_taps); + uint8x8_t d3 = convolve4_8(s3, s4, s5, s6, filter_taps); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + #endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c index 00bac3b9cf..b05a49d3fe 100644 --- 
a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c @@ -20,270 +20,139 @@ #include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" +// Filter values always sum to 128. +#define FILTER_SUM 128 + DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { - 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, - 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 -}; - DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { - /* Shift left and insert new last column in transposed 4x4 block. */ + // Shift left and insert new last column in transposed 4x4 block. 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, - /* Shift left and insert two new columns in transposed 4x4 block. */ + // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, - /* Shift left and insert three new columns in transposed 4x4 block. */ + // Shift left and insert three new columns in transposed 4x4 block. 
3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE void vpx_convolve_4tap_2d_horiz_neon_dotprod( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, - const int32x4_t correction, const uint8x16_t range_limit) { - uint8x16_t s0, s1, s2, s3; - - if (w == 4) { - const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - do { - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl); - d3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl); - /* We halved the filter values so -1 from right shift. */ - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. 
*/ - load_u8_16x3(src, src_stride, &s0, &s1, &s2); - - d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8_4x1(dst + 2 * dst_stride, d23); - } else { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl); - d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - width = w; - s = src; - d = dst; - do { - load_u8_16x3(s, src_stride, &s0, &s1, &s2); +static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); - d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl); + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl); - store_u8_8x3(d, dst_stride, d0, d1, d2); + // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide + // by 2 since we halved the filter values.) + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2); + int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0); - s += 8; - d += 8; - width -= 8; - } while (width != 0); - } + // Further narrowing and packing is performed by the caller. + return vmovn_s32(sum); } -static INLINE void vpx_convolve_8tap_2d_horiz_neon_dotprod( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, - const int32x4_t correction, const uint8x16_t range_limit) { - uint8x16_t s0, s1, s2, s3; - - if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - do { - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); - d3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. 
*/ - load_u8_16x3(src, src_stride, &s0, &s1, &s2); - - d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8_4x1(dst + 2 * dst_stride, d23); - } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - width = w; - s = src; - d = dst; - do { - load_u8_16x3(s, src_stride, &s0, &s1, &s2); - - d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); - - store_u8_8x3(d, dst_stride, d0, d1, d2); - - s += 8; - d += 8; - width -= 8; - } while (width != 0); - } +static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. 
+ int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide + // by 2 since we halved the filter values.) + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, - ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, - int y_step_q4, int w, int h) { - const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int32x4_t correction_8tap = - vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS))); - const uint8x16_t range_limit = vdupq_n_u8(128); - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { - /* All 4-tap and bilinear filter values are even, so halve them to reduce - * intermediate precision requirements. Also slide the filter values so the - * the 4 taps exist in the first 4 elements of the vector. 
- */ - const int8x8_t x_filter_4tap = - vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); - const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1); - vpx_convolve_4tap_2d_horiz_neon_dotprod(src - 1, src_stride, dst, - dst_stride, w, h, x_filter_4tap, - correction_4tap, range_limit); +static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + // Accumulate into 128 * FILTER_SUM to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM); + int32x4_t sum = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1); + + // Further narrowing and packing is performed by the caller. + return vshrn_n_s32(sum, 1); +} - } else { - vpx_convolve_8tap_2d_horiz_neon_dotprod(src - 3, src_stride, dst, - dst_stride, w, h, x_filter_8tap, - correction_8tap, range_limit); - } +static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]), + vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; + + // Accumulate into 128 * FILTER_SUM to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + sum0 = vdotq_lane_s32(sum0, perm_samples[1], filters, 1); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); + sum1 = vdotq_lane_s32(sum1, perm_samples[2], filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1)); + return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void vpx_convolve_4tap_horiz_neon_dotprod( +static INLINE void convolve_4tap_horiz_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, - const int32x4_t correction, const uint8x16_t range_limit) { - uint8x16_t s0, s1, s2, s3; - + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { if (w == 4) { - const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl); - t1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl); - t2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl); - t3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl); - /* We halved the filter values so -1 from right shift. 
*/ - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl); + int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl); + int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl); + int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -293,23 +162,21 @@ static INLINE void vpx_convolve_4tap_horiz_neon_dotprod( h -= 4; } while (h != 0); } else { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); do { - width = w; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl); - d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl); - d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl); + uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -324,26 +191,22 @@ static INLINE void vpx_convolve_4tap_horiz_neon_dotprod( } } -static INLINE void vpx_convolve_8tap_horiz_neon_dotprod( +static INLINE void convolve_8tap_horiz_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t 
dst_stride, int w, int h, const int8x8_t filter, - const int32x4_t correction, const uint8x16_t range_limit) { - uint8x16_t s0, s1, s2, s3; - + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); - t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); - t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); - t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl); + int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -353,23 +216,21 @@ static INLINE void vpx_convolve_8tap_horiz_neon_dotprod( h -= 4; } while (h != 0); } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); do { - width = w; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filter, 
correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl); + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -389,11 +250,6 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int32x4_t correction_8tap = - vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS))); - const uint8x16_t range_limit = vdupq_n_u8(128); - assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); @@ -403,21 +259,21 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, (void)y_step_q4; if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { - /* All 4-tap and bilinear filter values are even, so halve them to reduce - * intermediate precision requirements. Also slide the filter values so the - * the 4 taps exist in the first 4 elements of the vector. - */ + // Load 4-tap filter into first 4 elements of the vector. + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. 
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2); const int8x8_t x_filter_4tap = - vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); - const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1); - vpx_convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride, - w, h, x_filter_4tap, correction_4tap, - range_limit); + vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride, w, h, + x_filter_4tap); } else { - vpx_convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride, - w, h, x_filter_8tap, correction_8tap, - range_limit); + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride, w, h, + x_filter_8tap); } } @@ -428,10 +284,6 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); - uint8x16_t s0, s1, s2, s3; assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -444,22 +296,21 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, src -= 3; if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23, dd01, dd23; + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); - t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); - t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); - t3 = convolve8_4_sdot(s3, filters, correction, range_limit, 
perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl); + int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl); + int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl); + int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride); d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); @@ -472,24 +323,23 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, h -= 4; } while (h != 0); } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); do { - width = w; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); - d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); - d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); - d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl); + uint8x8_t dd0, dd1, dd2, dd3; load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); d0 = vrhadd_u8(d0, dd0); @@ -511,260 +361,142 @@ void 
vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, } static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, - int8x8_t a3, int8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b = vqtbl2q_s8(samples, permute_tbl); + int8x8_t a3, int8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; + int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; + + int16x8_t a0123 = + vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0]; + + *b = vreinterpretq_s8_s16(a0123); } static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, int8x8_t a3, int8x16_t *b0, - int8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 
- * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); + int8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; + int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; + + int16x8x2_t a0123 = + vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)); + + *b0 = vreinterpretq_s8_s16(a0123.val[0]); + *b1 = vreinterpretq_s8_s16(a0123.val[1]); } -static INLINE void vpx_convolve_4tap_vert_neon_dotprod( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, - const int32x4_t correction, const uint8x8_t range_limit) { - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int8x16x2_t samples_LUT; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s78910; 
- int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - src += 7 * src_stride; +static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + // Accumulate into 128 * FILTER_SUM to account for range transform. 
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM); + int32x4_t sum = vdotq_lane_s32(acc, samples_lo, filters, 0); + sum = vdotq_lane_s32(sum, samples_hi, filters, 1); - do { - uint8x8_t t7, t8, t9, t10; - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - d0 = convolve4_4_sdot_partial(s0123, correction, filter); - d1 = convolve4_4_sdot_partial(s1234, correction, filter); - d2 = convolve4_4_sdot_partial(s2345, correction, filter); - d3 = convolve4_4_sdot_partial(s3456, correction, filter); - /* We halved the filter values so -1 from right shift. */ - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s0123 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s1234 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s2345 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. 
*/ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - d0 = convolve4_8_sdot_partial(s0123_lo, s0123_hi, correction, filter); - d1 = convolve4_8_sdot_partial(s1234_lo, s1234_hi, correction, filter); - d2 = convolve4_8_sdot_partial(s2345_lo, s2345_hi, correction, filter); - d3 = convolve4_8_sdot_partial(s3456_lo, s3456_hi, correction, filter); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Merge new data into block from previous iteration. 
*/ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s0123_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s1234_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s2345_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - s3456_lo = s78910_lo; - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s0123_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s1234_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s2345_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - s3456_hi = s78910_hi; + // Further narrowing and packing is performed by the caller. + return vshrn_n_s32(sum, 1); +} - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src += 8; - dst += 8; - w -= 8; - } while (w != 0); - } +static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo, + const int8x16_t samples0_hi, + const int8x16_t samples1_lo, + const int8x16_t samples1_hi, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. + + // Accumulate into 128 * FILTER_SUM to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, samples0_lo, filters, 0); + sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, samples1_lo, filters, 0); + sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1); + + // Narrow and re-pack. 
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1)); + return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void vpx_convolve_8tap_vert_neon_dotprod( +static INLINE void convolve_8tap_vert_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter, - const int32x4_t correction, const uint8x8_t range_limit) { + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int8x16x2_t samples_LUT; if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); src += 7 * src_stride; - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + // Transform sample range to [-128, 127] for 8-bit signed dot product. 
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + int8x16_t s0123, s1234, s2345, s3456; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t t7, t8, t9, t10; - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + int8x16_t s78910; + transpose_concat_4x4(s7, s8, s9, s10, &s78910); - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + // Merge new data into block from previous iteration. 
+ int8x16x2_t samples_LUT = { { s3456, s78910 } }; + int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + int16x4_t d0 = convolve8_4_v(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -781,83 +513,70 @@ static INLINE void vpx_convolve_8tap_vert_neon_dotprod( h -= 4; } while (h != 0); } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - do { - height = h; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); s += 7 * src_stride; - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. 
*/ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. 
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t t7, t8, t9, t10; - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); + int8x16_t s78910_lo, s78910_hi; + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + // Merge new data into block from previous iteration. 
+ int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } }; + int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); samples_LUT.val[0] = s3456_hi; samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filter); - d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filter); - d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filter); - d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filter); + int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; @@ -883,11 +602,6 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4])); - const int32x4_t correction_8tap = - vdupq_n_s32(vaddlvq_s16(vshll_n_s8(y_filter_8tap, FILTER_BITS))); - const uint8x8_t range_limit = vdup_n_u8(128); - assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); @@ -897,20 +611,15 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, (void)y_step_q4; if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { - /* All 4-tap and bilinear filter values are even, so halve them to reduce - * intermediate precision requirements. Also slide the filter values so the - * the 4 taps exist in the first 4 elements of the vector. - */ - const int8x8_t y_filter_4tap = - vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2); - const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1); - vpx_convolve_4tap_vert_neon_dotprod(src - src_stride, src_stride, dst, - dst_stride, w, h, y_filter_4tap, - correction_4tap, range_limit); + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h, + y_filter); } else { - vpx_convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst, - dst_stride, w, h, y_filter_8tap, - correction_8tap, range_limit); + const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4])); + + convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter); } } @@ -921,13 +630,7 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const int16x8_t correct_tmp = 
vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int8x16x2_t samples_LUT; assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -940,59 +643,54 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, src -= 3 * src_stride; if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23, dd01, dd23; - + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); src += 7 * src_stride; - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + // Transform sample range to [-128, 127] for 8-bit signed dot product. 
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + int8x16_t s0123, s1234, s2345, s3456; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t t7, t8, t9, t10; - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + int8x16_t s78910; + transpose_concat_4x4(s7, s8, s9, s10, &s78910); - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + // Merge new data into block from previous iteration. 
+ int8x16x2_t samples_LUT = { { s3456, s78910 } }; + int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + int16x4_t d0 = convolve8_4_v(s0123, s4567, filters); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filters); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filters); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filters); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride); d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); @@ -1000,8 +698,8 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
s0123 = s4567; s1234 = s5678; s2345 = s6789; @@ -1012,79 +710,67 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, h -= 4; } while (h != 0); } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - const uint8_t *s; - uint8_t *d; - int height; - do { - height = h; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); s += 7 * src_stride; - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); + // Transform sample range to [-128, 127] for 8-bit signed dot product. 
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t t7, t8, t9, t10; - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); + int8x16_t s78910_lo, s78910_hi; + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - /* Merge new data into block from previous iteration. 
*/ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + // Merge new data into block from previous iteration. + int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } }; + int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); samples_LUT.val[0] = s3456_hi; samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); - + int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters); + + uint8x8_t dd0, dd1, dd2, dd3; load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); d0 = vrhadd_u8(d0, dd0); @@ -1094,8 +780,8 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, store_u8_8x4(d, dst_stride, 
d0, d1, d2, d3); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; @@ -1115,3 +801,275 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, } while (w != 0); } } + +static INLINE void convolve_4tap_2d_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int8x8_t x_filter, + const uint8x8_t y_filter) { + // Neon does not have lane-referencing multiply or multiply-accumulate + // instructions that operate on vectors of 8-bit elements. This means we have + // to duplicate filter taps into a whole vector and use standard multiply / + // multiply-accumulate instructions. + const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2), + vdup_lane_u8(y_filter, 3), + vdup_lane_u8(y_filter, 4), + vdup_lane_u8(y_filter, 5) }; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); + + int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl); + int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl); + int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl); + // We halved the filter values so -1 from right shift. 
+ uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1); + + src += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl); + int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl); + int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl); + int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1); + uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1); + uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4); + uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4); + + uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps); + uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps); + + store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01); + store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23); + + v_s01 = v_s45; + v_s12 = v_s56; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2); + + uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl); + uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl); + uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl); + + s += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl); + uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl); + uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl); + uint8x8_t v_s6 = 
convolve4_8_h(h_s6, x_filter, permute_tbl); + + uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps); + uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps); + uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps); + uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void convolve_8tap_2d_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl); + int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm() + // below for further details on possible values of block height. 
+ uint8x16_t s0, s1, s2; + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = + vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm() + // below for further details on possible values of block height. 
+ const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2; + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + } +} + +void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + + (void)x_step_q4; + (void)y_step_q4; + + const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8; + const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 + // lines post both horizontally and vertically. + const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1; + const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride; + + if (x_filter_taps == 4 && y_filter_taps == 4) { + const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2); + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + // 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int8x8_t x_filter_4tap = + vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + const uint8x8_t y_filter_4tap = + vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1); + + convolve_4tap_2d_neon_dotprod(src - horiz_offset - vert_offset, src_stride, + dst, dst_stride, w, h, x_filter_4tap, + y_filter_4tap); + return; + } + + // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the + // maximum buffer size to 64 * (64 + 7). 
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]); + const int im_stride = 64; + const int im_height = h + SUBPEL_TAPS - 1; + + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4])); + + convolve_8tap_2d_horiz_neon_dotprod(src - horiz_offset - vert_offset, + src_stride, im_block, im_stride, w, + im_height, x_filter_8tap); + + convolve_8tap_vert_neon_dotprod(im_block, im_stride, dst, dst_stride, w, h, + y_filter_8tap); +} + +void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]); + const int im_stride = 64; + + // Averaging convolution always uses an 8-tap filter. + // Account for the vertical phase needing 3 lines prior and 4 lines post. + const int im_height = h + SUBPEL_TAPS - 1; + const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + convolve_8tap_2d_horiz_neon_dotprod(src - offset - offset * src_stride, + src_stride, im_block, im_stride, w, + im_height, x_filter_8tap); + + vpx_convolve8_avg_vert_neon_dotprod(im_block + offset * im_stride, im_stride, + dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c index bcad1dd121..e582004133 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c @@ -26,255 +26,112 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { - 0, 8, 
16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, - 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 -}; - DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { - /* Shift left and insert new last column in transposed 4x4 block. */ + // Shift left and insert new last column in transposed 4x4 block. 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, - /* Shift left and insert two new columns in transposed 4x4 block. */ + // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, - /* Shift left and insert three new columns in transposed 4x4 block. */ + // Shift left and insert three new columns in transposed 4x4 block. 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE void vpx_convolve_4tap_2d_horiz_neon_i8mm( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { - uint8x16_t s0, s1, s2, s3; +static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); - if (w == 4) { - const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; + int32x4_t sum = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0); - do { - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve4_4_usdot(s0, filter, perm_tbl); - d1 = convolve4_4_usdot(s1, filter, perm_tbl); - d2 = convolve4_4_usdot(s2, filter, perm_tbl); - d3 = convolve4_4_usdot(s3, filter, perm_tbl); - /* We halved the filter values so -1 from right shift. 
*/ - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - load_u8_16x3(src, src_stride, &s0, &s1, &s2); - - d0 = convolve4_4_usdot(s0, filter, perm_tbl); - d1 = convolve4_4_usdot(s1, filter, perm_tbl); - d2 = convolve4_4_usdot(s2, filter, perm_tbl); - /* We halved the filter values so -1 from right shift. */ - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8_4x1(dst + 2 * dst_stride, d23); - } else { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve4_8_usdot(s0, filter, perm_tbl); - d1 = convolve4_8_usdot(s1, filter, perm_tbl); - d2 = convolve4_8_usdot(s2, filter, perm_tbl); - d3 = convolve4_8_usdot(s3, filter, perm_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. 
*/ - width = w; - s = src; - d = dst; - do { - load_u8_16x3(s, src_stride, &s0, &s1, &s2); - - d0 = convolve4_8_usdot(s0, filter, perm_tbl); - d1 = convolve4_8_usdot(s1, filter, perm_tbl); - d2 = convolve4_8_usdot(s2, filter, perm_tbl); - - store_u8_8x3(d, dst_stride, d0, d1, d2); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - } + // Further narrowing and packing is performed by the caller. + return vmovn_s32(sum); } -static INLINE void vpx_convolve_8tap_2d_horiz_neon_i8mm( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { - uint8x16_t s0, s1, s2, s3; - - if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - do { - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_4_usdot(s0, filter, perm_tbl); - d1 = convolve8_4_usdot(s1, filter, perm_tbl); - d2 = convolve8_4_usdot(s2, filter, perm_tbl); - d3 = convolve8_4_usdot(s3, filter, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. 
*/ - load_u8_16x3(src, src_stride, &s0, &s1, &s2); - - d0 = convolve8_4_usdot(s0, filter, perm_tbl); - d1 = convolve8_4_usdot(s1, filter, perm_tbl); - d2 = convolve8_4_usdot(s2, filter, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8_4x1(dst + 2 * dst_stride, d23); - } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_usdot(s0, filter, perm_tbl); - d1 = convolve8_8_usdot(s1, filter, perm_tbl); - d2 = convolve8_8_usdot(s2, filter, perm_tbl); - d3 = convolve8_8_usdot(s3, filter, perm_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 3); - - /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for - * further details on possible values of block height. */ - width = w; - s = src; - d = dst; - do { - load_u8_16x3(s, src_stride, &s0, &s1, &s2); - - d0 = convolve8_8_usdot(s0, filter, perm_tbl); - d1 = convolve8_8_usdot(s1, filter, perm_tbl); - d2 = convolve8_8_usdot(s2, filter, perm_tbl); - - store_u8_8x3(d, dst_stride, d0, d1, d2); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - } +static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + // First 4 output values. 
+ int32x4_t sum0 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + // Second 4 output values. + int32x4_t sum1 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); - - assert((intptr_t)dst % 4 == 0); - assert(dst_stride % 4 == 0); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { - /* All 4-tap and bilinear filter values are even, so halve them to reduce - * intermediate precision requirements. Also slide the filter values so the - * the 4 taps exist in the first 4 elements of the vector. - */ - const int8x8_t x_filter_4tap = - vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); - vpx_convolve_4tap_2d_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, - w, h, x_filter_4tap); - - } else { - vpx_convolve_8tap_2d_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, - w, h, x_filter_8tap); - } +static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + int32x4_t sum = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); + + // Further narrowing and packing is performed by the caller. + return vshrn_n_s32(sum, 1); } -static INLINE void vpx_convolve_4tap_horiz_neon_i8mm( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { - uint8x16_t s0, s1, s2, s3; +static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t permuted_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]), + vqtbl1q_u8(samples, permute_tbl.val[2]) }; + + // First 4 output values. + int32x4_t sum0 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1); + // Second 4 output values. + int32x4_t sum1 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1); + + // Narrow and re-pack. 
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1)); + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} +static INLINE void convolve_4tap_horiz_neon_i8mm(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int8x8_t filter) { if (w == 4) { - const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve4_4_usdot(s0, filter, perm_tbl); - t1 = convolve4_4_usdot(s1, filter, perm_tbl); - t2 = convolve4_4_usdot(s2, filter, perm_tbl); - t3 = convolve4_4_usdot(s3, filter, perm_tbl); - /* We halved the filter values so -1 from right shift. */ - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl); + int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl); + int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl); + int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl); + // We halved the filter values so -1 from right shift. 
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -284,23 +141,21 @@ static INLINE void vpx_convolve_4tap_horiz_neon_i8mm( h -= 4; } while (h != 0); } else { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); do { - width = w; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve4_8_usdot(s0, filter, perm_tbl); - d1 = convolve4_8_usdot(s1, filter, perm_tbl); - d2 = convolve4_8_usdot(s2, filter, perm_tbl); - d3 = convolve4_8_usdot(s3, filter, perm_tbl); + uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -315,25 +170,24 @@ static INLINE void vpx_convolve_4tap_horiz_neon_i8mm( } } -static INLINE void vpx_convolve_8tap_horiz_neon_i8mm( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { - uint8x16_t s0, s1, s2, s3; - +static INLINE void convolve_8tap_horiz_neon_i8mm(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int8x8_t filter) { if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23; + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = 
convolve8_4_usdot(s0, filter, perm_tbl); - t1 = convolve8_4_usdot(s1, filter, perm_tbl); - t2 = convolve8_4_usdot(s2, filter, perm_tbl); - t3 = convolve8_4_usdot(s3, filter, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl); + int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); @@ -343,23 +197,21 @@ static INLINE void vpx_convolve_8tap_horiz_neon_i8mm( h -= 4; } while (h != 0); } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); do { - width = w; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_usdot(s0, filter, perm_tbl); - d1 = convolve8_8_usdot(s1, filter, perm_tbl); - d2 = convolve8_8_usdot(s2, filter, perm_tbl); - d3 = convolve8_8_usdot(s3, filter, perm_tbl); + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); @@ -379,8 +231,6 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int8x8_t x_filter_8tap = 
vmovn_s16(vld1q_s16(filter[x0_q4])); - assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); @@ -390,18 +240,21 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, (void)y_step_q4; if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { - /* All 4-tap and bilinear filter values are even, so halve them to reduce - * intermediate precision requirements. Also slide the filter values so the - * the 4 taps exist in the first 4 elements of the vector. - */ + // Load 4-tap filter into first 4 elements of the vector. + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2); const int8x8_t x_filter_4tap = - vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2); - vpx_convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w, - h, x_filter_4tap); + vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w, h, + x_filter_4tap); } else { - vpx_convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w, - h, x_filter_8tap); + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w, h, + x_filter_8tap); } } @@ -411,7 +264,6 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - uint8x16_t s0, s1, s2, s3; assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -424,22 +276,21 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, src -= 3; if (w == 4) { - const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int16x4_t t0, t1, t2, t3; - uint8x8_t d01, d23, dd01, dd23; + const uint8x16x2_t permute_tbl = 
vld1q_u8_x2(dot_prod_permute_tbl); + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - t0 = convolve8_4_usdot(s0, filters, perm_tbl); - t1 = convolve8_4_usdot(s1, filters, perm_tbl); - t2 = convolve8_4_usdot(s2, filters, perm_tbl); - t3 = convolve8_4_usdot(s3, filters, perm_tbl); - d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl); + int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl); + int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl); + int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride); d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); @@ -452,24 +303,23 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); do { - width = w; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + do { + uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_usdot(s0, filters, perm_tbl); - d1 = convolve8_8_usdot(s1, filters, perm_tbl); - d2 = convolve8_8_usdot(s2, filters, perm_tbl); - d3 = convolve8_8_usdot(s3, filters, perm_tbl); + uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl); + uint8x8_t d2 = 
convolve8_8_h(s2, filters, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl); + uint8x8_t dd0, dd1, dd2, dd3; load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); d0 = vrhadd_u8(d0, dd0); @@ -492,216 +342,130 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b = vqtbl2q_u8(samples, permute_tbl); + uint8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; + uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; + + uint16x8_t a0123 = + vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0]; + + *b = vreinterpretq_u8_u16(a0123); } static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b0, uint8x16_t *b1, - 
const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); + uint8x16_t *b0, uint8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; + uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; + + uint16x8x2_t a0123 = + vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)); + + *b0 = vreinterpretq_u8_u16(a0123.val[0]); + *b1 = vreinterpretq_u8_u16(a0123.val[1]); } -static INLINE void vpx_convolve_4tap_vert_neon_i8mm( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, 
s6, s7, s8, s9, s10; - uint8x16x2_t samples_LUT; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - src += 7 * src_stride; - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - - do { - load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - d0 = convolve4_4_usdot_partial(s0123, filter); - d1 = convolve4_4_usdot_partial(s1234, filter); - d2 = convolve4_4_usdot_partial(s2345, filter); - d3 = convolve4_4_usdot_partial(s3456, filter); - /* We halved the filter values so -1 from right shift. */ - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Merge new data into block from previous iteration. 
*/ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s0123 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s1234 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s2345 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h != 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; +static INLINE int16x4_t convolve8_4_v(const uint8x16_t samples_lo, + const uint8x16_t samples_hi, + const int8x8_t filters) { + // Sample permutation is performed by the caller. + int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); + sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); - load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - s += 7 * src_stride; - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. 
- */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - - do { - load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - d0 = convolve4_8_usdot_partial(s0123_lo, s0123_hi, filter); - d1 = convolve4_8_usdot_partial(s1234_lo, s1234_hi, filter); - d2 = convolve4_8_usdot_partial(s2345_lo, s2345_hi, filter); - d3 = convolve4_8_usdot_partial(s3456_lo, s3456_hi, filter); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s0123_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s1234_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s2345_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - s3456_lo = s78910_lo; - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s0123_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s1234_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s2345_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - s3456_hi = s78910_hi; + // Further narrowing and packing is performed by the caller. + return vshrn_n_s32(sum, 1); +} - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height != 0); - src += 8; - dst += 8; - w -= 8; - } while (w != 0); - } +static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo, + const uint8x16_t samples0_hi, + const uint8x16_t samples1_lo, + const uint8x16_t samples1_hi, + const int8x8_t filters) { + // Sample permutation is performed by the caller. + + // First 4 output values. 
+ int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0); + sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1); + // Second 4 output values. + int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0); + sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1)); + return vqrshrun_n_s16(sum, FILTER_BITS - 1); } -static INLINE void vpx_convolve_8tap_vert_neon_i8mm( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { +static INLINE void convolve_8tap_vert_neon_i8mm(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int8x8_t filter) { const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x16x2_t samples_LUT; - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - + uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. 
+ uint8x16_t s0123, s1234, s2345, s3456; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { + uint8x8_t s7, s8, s9, s10; load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + uint8x16_t s78910; + transpose_concat_4x4(s7, s8, s9, s10, &s78910); - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT = { { s3456, s78910 } }; + uint8x16_t s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + uint8x16_t s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_usdot_partial(s0123, s4567, filter); - d1 = convolve8_4_usdot_partial(s1234, s5678, filter); - d2 = convolve8_4_usdot_partial(s2345, s6789, filter); - d3 = convolve8_4_usdot_partial(s3456, s78910, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + int16x4_t d0 = convolve8_4_v(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
*/ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123 = s4567; s1234 = s5678; s2345 = s6789; @@ -712,67 +476,56 @@ static INLINE void vpx_convolve_8tap_vert_neon_i8mm( h -= 4; } while (h != 0); } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - do { - height = h; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. 
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { + uint8x8_t s7, s8, s9, s10; load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); + uint8x16_t s78910_lo, s78910_hi; + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } }; + uint8x16_t s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + uint8x16_t s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); samples_LUT.val[0] = s3456_hi; samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filter); - d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filter); - d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filter); - d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filter); + uint8x16_t s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + 
uint8x16_t s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; @@ -798,8 +551,6 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4])); - assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); @@ -809,17 +560,15 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, (void)y_step_q4; if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { - /* All 4-tap and bilinear filter values are even, so halve them to reduce - * intermediate precision requirements. Also slide the filter values so the - * the 4 taps exist in the first 4 elements of the vector. 
- */ - const int8x8_t y_filter_4tap = - vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2); - vpx_convolve_4tap_vert_neon_i8mm(src - src_stride, src_stride, dst, - dst_stride, w, h, y_filter_4tap); + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h, + y_filter); } else { - vpx_convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst, - dst_stride, w, h, y_filter_8tap); + const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4])); + + convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter); } } @@ -830,8 +579,6 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, int w, int h) { const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x16x2_t samples_LUT; assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); @@ -844,43 +591,40 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23, dd01, dd23; - + uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. 
- */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + uint8x16_t s0123, s1234, s2345, s3456; + transpose_concat_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { + uint8x8_t s7, s8, s9, s10; load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + uint8x16_t s78910; + transpose_concat_4x4(s7, s8, s9, s10, &s78910); - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + // Merge new data into block from previous iteration. 
+ uint8x16x2_t samples_LUT = { { s3456, s78910 } }; + uint8x16_t s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + uint8x16_t s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_usdot_partial(s0123, s4567, filters); - d1 = convolve8_4_usdot_partial(s1234, s5678, filters); - d2 = convolve8_4_usdot_partial(s2345, s6789, filters); - d3 = convolve8_4_usdot_partial(s3456, s78910, filters); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + int16x4_t d0 = convolve8_4_v(s0123, s4567, filters); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filters); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filters); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filters); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride); d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); @@ -888,8 +632,8 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, store_u8(dst + 0 * dst_stride, dst_stride, d01); store_u8(dst + 2 * dst_stride, dst_stride, d23); - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
s0123 = s4567; s1234 = s5678; s2345 = s6789; @@ -900,63 +644,53 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - const uint8_t *s; - uint8_t *d; - int height; - do { - height = h; - s = src; - d = dst; + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. 
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { + uint8x8_t s7, s8, s9, s10; load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); + uint8x16_t s78910_lo, s78910_hi; + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } }; + uint8x16_t s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + uint8x16_t s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); samples_LUT.val[0] = s3456_hi; samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filters); - d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filters); - d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filters); - d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filters); - + uint8x16_t s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + 
uint8x16_t s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters); + + uint8x8_t dd0, dd1, dd2, dd3; load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); d0 = vrhadd_u8(d0, dd0); @@ -987,3 +721,275 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, } while (w != 0); } } + +static INLINE void convolve_4tap_2d_neon_i8mm(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int8x8_t x_filter, + const uint8x8_t y_filter) { + // Neon does not have lane-referencing multiply or multiply-accumulate + // instructions that operate on vectors of 8-bit elements. This means we have + // to duplicate filter taps into a whole vector and use standard multiply / + // multiply-accumulate instructions. + const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2), + vdup_lane_u8(y_filter, 3), + vdup_lane_u8(y_filter, 4), + vdup_lane_u8(y_filter, 5) }; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); + + int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl); + int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl); + int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl); + // We halved the filter values so -1 from right shift. 
+ uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1); + + src += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl); + int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl); + int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl); + int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1); + uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1); + uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4); + uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4); + + uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps); + uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps); + + store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01); + store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23); + + v_s01 = v_s45; + v_s12 = v_s56; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2); + + uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl); + uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl); + uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl); + + s += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl); + uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl); + uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl); + uint8x8_t v_s6 = 
convolve4_8_h(h_s6, x_filter, permute_tbl); + + uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps); + uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps); + uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps); + uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void convolve_8tap_2d_horiz_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl); + int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm() + // below for further details on possible values of block height. 
+ uint8x16_t s0, s1, s2; + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = + vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm() + // below for further details on possible values of block height. 
+ const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2; + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + } +} + +void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + + (void)x_step_q4; + (void)y_step_q4; + + const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8; + const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 + // lines post both horizontally and vertically. + const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1; + const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride; + + if (x_filter_taps == 4 && y_filter_taps == 4) { + const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2); + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + // 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int8x8_t x_filter_4tap = + vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + const uint8x8_t y_filter_4tap = + vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1); + + convolve_4tap_2d_neon_i8mm(src - horiz_offset - vert_offset, src_stride, + dst, dst_stride, w, h, x_filter_4tap, + y_filter_4tap); + return; + } + + // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the + // maximum buffer size to 64 * (64 + 7). 
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]); + const int im_stride = 64; + const int im_height = h + SUBPEL_TAPS - 1; + + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4])); + + convolve_8tap_2d_horiz_neon_i8mm(src - horiz_offset - vert_offset, src_stride, + im_block, im_stride, w, im_height, + x_filter_8tap); + + convolve_8tap_vert_neon_i8mm(im_block, im_stride, dst, dst_stride, w, h, + y_filter_8tap); +} + +void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]); + const int im_stride = 64; + + // Averaging convolution always uses an 8-tap filter. + // Account for the vertical phase needing 3 lines prior and 4 lines post. + const int im_height = h + SUBPEL_TAPS - 1; + const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + convolve_8tap_2d_horiz_neon_i8mm(src - offset - offset * src_stride, + src_stride, im_block, im_stride, w, + im_height, x_filter_8tap); + + vpx_convolve8_avg_vert_neon_i8mm(im_block + offset * im_stride, im_stride, + dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c index 57772ea668..de5fa29471 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c @@ -19,31 +19,32 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can 
reduce the - * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). - */ - uint8_t temp[64 * 72]; + // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the + // maximum buffer size to 64 * (64 + 7) (+1 row to make it divisible by 4). + DECLARE_ALIGNED(32, uint8_t, im_block[64 * 72]); + const int im_stride = 64; const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; - /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior - * and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.) */ - const int intermediate_height = h + vert_filter_taps; + // Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior + // and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.) + const int im_height = h + vert_filter_taps; const ptrdiff_t border_offset = vert_filter_taps / 2 - 1; assert(y_step_q4 == 16); assert(x_step_q4 == 16); - /* Filter starting border_offset lines back. The Neon implementation will - * ignore the given height and filter a multiple of 4 lines. Since this goes - * in to the temp buffer which has lots of extra room and is subsequently - * discarded this is safe if somewhat less than ideal. */ - vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride, temp, - w, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, - intermediate_height); + // Filter starting border_offset rows back. The Neon implementation will + // ignore the given height and filter a multiple of 4 lines. Since this goes + // into the temporary buffer which has lots of extra room and is subsequently + // discarded this is safe if somewhat less than ideal. + vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride, + im_block, im_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, im_height); - /* Step into the temp buffer border_offset lines to get actual frame data. 
*/ - vpx_convolve8_vert_neon(temp + w * border_offset, w, dst, dst_stride, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + // Step into the temporary buffer border_offset rows to get actual frame data. + vpx_convolve8_vert_neon(im_block + im_stride * border_offset, im_stride, dst, + dst_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); } void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, @@ -51,18 +52,21 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - uint8_t temp[64 * 72]; - const int intermediate_height = h + 8; + DECLARE_ALIGNED(32, uint8_t, im_block[64 * 72]); + const int im_stride = 64; + const int im_height = h + SUBPEL_TAPS; + const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1; assert(y_step_q4 == 16); assert(x_step_q4 == 16); - /* This implementation has the same issues as above. In addition, we only want - * to average the values after both passes. - */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, - intermediate_height); - vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, - x_step_q4, y0_q4, y_step_q4, w, h); + // This implementation has the same issues as above. In addition, we only want + // to average the values after both passes. 
+ vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride, + im_block, im_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, im_height); + + vpx_convolve8_avg_vert_neon(im_block + im_stride * border_offset, im_stride, + dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c deleted file mode 100644 index 9d754fde17..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2023 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <assert.h> - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/arm/vpx_convolve8_neon.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_dsp/vpx_filter.h" -#include "vpx_ports/mem.h" - -void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the - * maximum buffer size to 64 * (64 + 7). */ - uint8_t temp[64 * 71]; - - const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; - /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior - * and vert_filter_taps / 2 lines post. 
*/ - const int intermediate_height = h + vert_filter_taps - 1; - const ptrdiff_t border_offset = vert_filter_taps / 2 - 1; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - vpx_convolve8_2d_horiz_neon_dotprod( - src - src_stride * border_offset, src_stride, temp, w, filter, x0_q4, - x_step_q4, y0_q4, y_step_q4, w, intermediate_height); - - vpx_convolve8_vert_neon_dotprod(temp + w * border_offset, w, dst, dst_stride, - filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, - h); -} - -void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - uint8_t temp[64 * 71]; - - /* Averaging convolution always uses an 8-tap filter. */ - /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ - const int intermediate_height = h + 7; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w, - filter, x0_q4, x_step_q4, y0_q4, - y_step_q4, w, intermediate_height); - - vpx_convolve8_avg_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); -} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c deleted file mode 100644 index d7cbb09ea6..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2023 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <assert.h> - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/arm/vpx_convolve8_neon.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_dsp/vpx_filter.h" -#include "vpx_ports/mem.h" - -void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the - * maximum buffer size to 64 * (64 + 7). */ - uint8_t temp[64 * 71]; - - const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; - /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior - * and vert_filter_taps / 2 lines post. */ - const int intermediate_height = h + vert_filter_taps - 1; - const ptrdiff_t border_offset = vert_filter_taps / 2 - 1; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * border_offset, src_stride, - temp, w, filter, x0_q4, x_step_q4, y0_q4, - y_step_q4, w, intermediate_height); - - vpx_convolve8_vert_neon_i8mm(temp + w * border_offset, w, dst, dst_stride, - filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, - h); -} - -void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - uint8_t temp[64 * 71]; - - /* Averaging convolution always uses an 8-tap filter. */ - /* Account for the vertical phase needing 3 lines prior and 4 lines post. 
*/ - const int intermediate_height = h + 7; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w, - filter, x0_q4, x_step_q4, y0_q4, y_step_q4, - w, intermediate_height); - - vpx_convolve8_avg_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter, - x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); -} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h new file mode 100644 index 0000000000..bf9f18c7e6 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_ +#define VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_ + +#include <arm_neon.h> +#include <arm_sve.h> +#include <arm_neon_sve_bridge.h> + +// Some very useful instructions are exclusive to the SVE2 instruction set. +// However, we can access these instructions from a predominantly Neon context +// by making use of the Neon-SVE bridge intrinsics to reinterpret Neon vectors +// as SVE vectors - with the high part of the SVE vector (if it's longer than +// 128 bits) being "don't care". 
+ +static INLINE int16x8_t vpx_tbl2_s16(int16x8_t s0, int16x8_t s1, + uint16x8_t tbl) { + svint16x2_t samples = svcreate2_s16(svset_neonq_s16(svundef_s16(), s0), + svset_neonq_s16(svundef_s16(), s1)); + return svget_neonq_s16( + svtbl2_s16(samples, svset_neonq_u16(svundef_u16(), tbl))); +} + +#endif // VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h new file mode 100644 index 0000000000..48534fb70e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_ +#define VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_ + +#include <arm_neon.h> +#include <arm_sve.h> +#include <arm_neon_sve_bridge.h> + +// Dot product instructions operating on 16-bit input elements are exclusive to +// the SVE instruction set. However, we can access these instructions from a +// predominantly Neon context by making use of the Neon-SVE bridge intrinsics +// to reinterpret Neon vectors as SVE vectors - with the high part of the SVE +// vector (if it's longer than 128 bits) being "don't care". + +// While sub-optimal on machines that have SVE vector length > 128-bit - as the +// remainder of the vector is unused - this approach is still beneficial when +// compared to a Neon-only solution. 
+ +static INLINE uint64x2_t vpx_dotq_u16(uint64x2_t acc, uint16x8_t x, + uint16x8_t y) { + return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc), + svset_neonq_u16(svundef_u16(), x), + svset_neonq_u16(svundef_u16(), y))); +} + +static INLINE int64x2_t vpx_dotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) { + return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc), + svset_neonq_s16(svundef_s16(), x), + svset_neonq_s16(svundef_s16(), y))); +} + +#define vpx_dotq_lane_s16(acc, x, y, lane) \ + svget_neonq_s64(svdot_lane_s64(svset_neonq_s64(svundef_s64(), acc), \ + svset_neonq_s16(svundef_s16(), x), \ + svset_neonq_s16(svundef_s16(), y), lane)) + +static INLINE uint16x8_t vpx_tbl_u16(uint16x8_t data, uint16x8_t indices) { + return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), data), + svset_neonq_u16(svundef_u16(), indices))); +} + +#endif // VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c index b8e3c5e540..9bd5ec285c 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c @@ -20,263 +20,271 @@ #include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_ports/mem.h" -static INLINE void scaledconvolve_horiz_w4( +static INLINE void scaledconvolve_horiz_neon( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const x_filters, - const int x0_q4, const int x_step_q4, const int w, const int h) { - DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); - int x, y, z; + const ptrdiff_t dst_stride, const InterpKernel *const x_filter, + const int x0_q4, const int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); src -= SUBPEL_TAPS / 2 - 1; - y = h; - do { - int x_q4 = x0_q4; - x = 0; + if (w == 4) { do { - // process 4 src_x steps - for (z = 0; z < 4; ++z) { - const 
uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + int x_q4 = x0_q4; + + // Process a 4x4 tile. + for (int r = 0; r < 4; ++r) { + const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; + if (x_q4 & SUBPEL_MASK) { - const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); - uint8x8_t s[8], d; - int16x8_t ss[4]; - int16x4_t t[8], tt; - - load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]); - transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]); - - ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); - ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); - ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); - ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); - t[0] = vget_low_s16(ss[0]); - t[1] = vget_low_s16(ss[1]); - t[2] = vget_low_s16(ss[2]); - t[3] = vget_low_s16(ss[3]); - t[4] = vget_high_s16(ss[0]); - t[5] = vget_high_s16(ss[1]); - t[6] = vget_high_s16(ss[2]); - t[7] = vget_high_s16(ss[3]); - - tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], - filters); - d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); - vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0); + const int16x8_t filter = vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]); + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d0 = + vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS); + + 
store_u8_4x1(&temp[4 * r], d0); } else { - int i; - for (i = 0; i < 4; ++i) { - temp[z * 4 + i] = src_x[i * src_stride + 3]; + // Memcpy for non-subpel locations. + s += SUBPEL_TAPS / 2 - 1; + + for (int c = 0; c < 4; ++c) { + temp[r * 4 + c] = s[c * src_stride]; } } x_q4 += x_step_q4; } - // transpose the 4x4 filters values back to dst - { - const uint8x8x4_t d4 = vld4_u8(temp); - vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride], - vreinterpret_u32_u8(d4.val[0]), 0); - vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride], - vreinterpret_u32_u8(d4.val[1]), 0); - vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride], - vreinterpret_u32_u8(d4.val[2]), 0); - vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride], - vreinterpret_u32_u8(d4.val[3]), 0); - } - x += 4; - } while (x < w); + // Transpose the 4x4 result tile and store. + uint8x8_t d01 = vld1_u8(temp + 0); + uint8x8_t d23 = vld1_u8(temp + 8); - src += src_stride * 4; - dst += dst_stride * 4; - y -= 4; - } while (y > 0); -} + transpose_u8_4x4(&d01, &d23); -static INLINE void scaledconvolve_horiz_w8( - const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const x_filters, - const int x0_q4, const int x_step_q4, const int w, const int h) { - DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); - int x, y, z; - src -= SUBPEL_TAPS / 2 - 1; + store_u8_4x1(dst + 0 * dst_stride, d01); + store_u8_4x1(dst + 1 * dst_stride, d23); + store_u8_4x1_high(dst + 2 * dst_stride, d01); + store_u8_4x1_high(dst + 3 * dst_stride, d23); - // This function processes 8x8 areas. The intermediate height is not always - // a multiple of 8, so force it to be a multiple of 8 here. 
- y = (h + 7) & ~7; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + return; + } do { int x_q4 = x0_q4; - x = 0; + uint8_t *d = dst; + int width = w; + do { - uint8x8_t d[8]; - // process 8 src_x steps - for (z = 0; z < 8; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + // Process an 8x8 tile. + for (int r = 0; r < 8; ++r) { + const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; if (x_q4 & SUBPEL_MASK) { - const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); - uint8x8_t s[8]; - load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], - &s[5], &s[6], &s[7]); - transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], - &s[7]); - d[0] = scale_filter_8(s, filters); - vst1_u8(&temp[8 * z], d[0]); + const int16x8_t filter = vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + + vst1_u8(&temp[r * 8], d0); } else { - int i; - for (i = 0; i < 8; ++i) { - temp[z * 8 + i] = src_x[i * src_stride + 3]; + // Memcpy for non-subpel locations. 
+ s += SUBPEL_TAPS / 2 - 1; + + for (int c = 0; c < 8; ++c) { + temp[r * 8 + c] = s[c * src_stride]; } } x_q4 += x_step_q4; } - // transpose the 8x8 filters values back to dst - load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], - &d[7]); - transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); - vst1_u8(&dst[x + 0 * dst_stride], d[0]); - vst1_u8(&dst[x + 1 * dst_stride], d[1]); - vst1_u8(&dst[x + 2 * dst_stride], d[2]); - vst1_u8(&dst[x + 3 * dst_stride], d[3]); - vst1_u8(&dst[x + 4 * dst_stride], d[4]); - vst1_u8(&dst[x + 5 * dst_stride], d[5]); - vst1_u8(&dst[x + 6 * dst_stride], d[6]); - vst1_u8(&dst[x + 7 * dst_stride], d[7]); - x += 8; - } while (x < w); - - src += src_stride * 8; - dst += dst_stride * 8; - } while (y -= 8); -} + // Transpose the 8x8 result tile and store. + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; + load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); -static INLINE void scaledconvolve_vert_w4( - const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const y_filters, - const int y0_q4, const int y_step_q4, const int w, const int h) { - int y; - int y_q4 = y0_q4; + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - y = h; - do { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); - if (y_q4 & SUBPEL_MASK) { - const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); - uint8x8_t s[8], d; - int16x4_t t[8], tt; - - load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], - &s[6], &s[7]); - t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0]))); - t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1]))); - t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2]))); - t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3]))); - t[4] = 
vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4]))); - t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5]))); - t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); - t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); - - tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters); - d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); - } else { - memcpy(dst, &src_y[3 * src_stride], w); - } + d += 8; + width -= 8; + } while (width != 0); - dst += dst_stride; - y_q4 += y_step_q4; - } while (--y); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); } -static INLINE void scaledconvolve_vert_w8( +static INLINE void scaledconvolve_vert_neon( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const y_filters, - const int y0_q4, const int y_step_q4, const int w, const int h) { - int y; + const ptrdiff_t dst_stride, const InterpKernel *const y_filter, + const int y0_q4, const int y_step_q4, int w, int h) { int y_q4 = y0_q4; - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - y = h; - do { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - if (y_q4 & SUBPEL_MASK) { - const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); - uint8x8_t s[8], d; - load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], - &s[6], &s[7]); - d = scale_filter_8(s, filters); - vst1_u8(dst, d); - } else { - memcpy(dst, &src_y[3 * src_stride], w); - } - dst += dst_stride; - y_q4 += y_step_q4; - } while (--y); -} + if (w == 4) { + do { + const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; -static INLINE void scaledconvolve_vert_w16( - const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, - const ptrdiff_t dst_stride, const InterpKernel *const y_filters, - const int y0_q4, const int y_step_q4, const int w, const int h) { - int x, y; - int y_q4 = y0_q4; + if (y_q4 & 
SUBPEL_MASK) { + const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); + int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); + int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); + + int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d0 = + vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS); + + store_u8_4x1(dst, d0); + } else { + // Memcpy for non-subpel locations. + memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 4); + } + + y_q4 += y_step_q4; + dst += dst_stride; + } while (--h != 0); + return; + } + + if (w == 8) { + do { + const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + + vst1_u8(dst, d0); + } else { + // Memcpy for non-subpel locations. 
+ memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 8); + } + + y_q4 += y_step_q4; + dst += dst_stride; + } while (--h != 0); + return; + } - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - y = h; do { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + uint8_t *d = dst; + int width = w; + if (y_q4 & SUBPEL_MASK) { - x = 0; do { - const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); - uint8x16_t ss[8]; - uint8x8_t s[8], d[2]; - load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4], - &ss[5], &ss[6], &ss[7]); - s[0] = vget_low_u8(ss[0]); - s[1] = vget_low_u8(ss[1]); - s[2] = vget_low_u8(ss[2]); - s[3] = vget_low_u8(ss[3]); - s[4] = vget_low_u8(ss[4]); - s[5] = vget_low_u8(ss[5]); - s[6] = vget_low_u8(ss[6]); - s[7] = vget_low_u8(ss[7]); - d[0] = scale_filter_8(s, filters); - - s[0] = vget_high_u8(ss[0]); - s[1] = vget_high_u8(ss[1]); - s[2] = vget_high_u8(ss[2]); - s[3] = vget_high_u8(ss[3]); - s[4] = vget_high_u8(ss[4]); - s[5] = vget_high_u8(ss[5]); - s[6] = vget_high_u8(ss[6]); - s[7] = vget_high_u8(ss[7]); - d[1] = scale_filter_8(s, filters); - vst1q_u8(&dst[x], vcombine_u8(d[0], d[1])); - src_y += 16; - x += 16; - } while (x < w); + const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]); + + uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_16x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + s0[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); + s1[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1))); + s2[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2))); + s3[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3))); + s4[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t4))); + s5[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t5))); + s6[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t6))); + s7[0] = 
vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t7))); + + s0[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); + s1[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1))); + s2[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2))); + s3[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3))); + s4[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t4))); + s5[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t5))); + s6[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t6))); + s7[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t7))); + + uint8x8_t d0 = convolve8_8(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0], + s6[0], s7[0], filter); + uint8x8_t d1 = convolve8_8(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1], + s6[1], s7[1], filter); + + vst1q_u8(d, vcombine_u8(d0, d1)); + + s += 16; + d += 16; + width -= 16; + } while (width != 0); } else { - memcpy(dst, &src_y[3 * src_stride], w); + // Memcpy for non-subpel locations. + s += (SUBPEL_TAPS / 2 - 1) * src_stride; + + do { + uint8x16_t s0 = vld1q_u8(s); + vst1q_u8(d, s0); + s += 16; + d += 16; + width -= 16; + } while (width != 0); } - dst += dst_stride; + y_q4 += y_step_q4; - } while (--y); + dst += dst_stride; + } while (--h != 0); } void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // Fixed size intermediate buffer, im_block, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): + // Deriving the maximum number of rows in the im_block buffer (135): // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). // --Largest block size is 64x64 pixels. 
// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the @@ -288,33 +296,20 @@ void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, // When calling in frame scaling function, the smallest scaling factor is x1/4 // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still // big enough. - DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); - const int intermediate_height = + DECLARE_ALIGNED(16, uint8_t, im_block[(135 + 8) * 64]); + const int im_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + const ptrdiff_t im_stride = 64; assert(w <= 64); assert(h <= 64); assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); assert(x_step_q4 <= 64); - if (w >= 8) { - scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, filter, x0_q4, x_step_q4, w, - intermediate_height); - } else { - scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, filter, x0_q4, x_step_q4, w, - intermediate_height); - } + scaledconvolve_horiz_neon(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, im_block, im_stride, filter, x0_q4, + x_step_q4, w, im_height); - if (w >= 16) { - scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, filter, y0_q4, y_step_q4, w, h); - } else if (w == 8) { - scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, filter, y0_q4, y_step_q4, w, h); - } else { - scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, filter, y0_q4, y_step_q4, w, h); - } + scaledconvolve_vert_neon(im_block, im_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); } diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk index 2bee91f449..916dc62cef 100644 --- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk +++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk @@ -112,7 +112,8 @@ DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c 
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c -DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_neon.c +DSP_SRCS-$(HAVE_SVE) += arm/highbd_vpx_convolve8_sve.c +DSP_SRCS-$(HAVE_SVE2) += arm/highbd_vpx_convolve8_sve2.c endif DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm @@ -139,9 +140,7 @@ DSP_SRCS-yes += arm/vpx_convolve8_neon.c DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c DSP_SRCS-yes += arm/vpx_convolve_neon.c DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve8_neon_dotprod.c -DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve_neon_dotprod.c DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve8_neon_i8mm.c -DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve_neon_i8mm.c endif # HAVE_NEON endif # HAVE_NEON_ASM @@ -374,6 +373,7 @@ DSP_SRCS-yes += sad.c DSP_SRCS-yes += subtract.c DSP_SRCS-yes += sum_squares.c DSP_SRCS-$(HAVE_NEON) += arm/sum_squares_neon.c +DSP_SRCS-$(HAVE_SVE) += arm/sum_squares_sve.c DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c @@ -454,6 +454,8 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_sse_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/highbd_variance_neon_dotprod.c +DSP_SRCS-$(HAVE_SVE) += arm/highbd_variance_sve.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c index 030c456d39..2b8c656afb 100644 --- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c +++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c @@ -12,4 +12,4 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/vpx_once.h" -void vpx_dsp_rtcd() { 
once(setup_rtcd_internal); } +void vpx_dsp_rtcd(void) { once(setup_rtcd_internal); } diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index 18087e25d9..f40f85c036 100644 --- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -427,19 +427,19 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64"; add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; - specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64"; + specialize qw/vpx_highbd_convolve8_horiz avx2 neon sve/, "$sse2_x86_64"; add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; - specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64"; + specialize qw/vpx_highbd_convolve8_vert avx2 neon sve2/, "$sse2_x86_64"; add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64"; add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; - specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64"; + specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon sve/, "$sse2_x86_64"; add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, 
ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd"; - specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64"; + specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon sve2/, "$sse2_x86_64"; } # CONFIG_VP9_HIGHBITDEPTH if (vpx_config("CONFIG_VP9") eq "yes") { @@ -1009,7 +1009,7 @@ add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, specialize qw/vpx_sad_skip_4x4x4d neon/; add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; -specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/; +specialize qw/vpx_sum_squares_2d_i16 neon sve sse2 msa/; # # Structured Similarity (SSIM) @@ -1411,163 +1411,163 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, i if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance64x64 sse2 neon/; + specialize qw/vpx_highbd_12_variance64x64 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance64x32 sse2 neon/; + specialize qw/vpx_highbd_12_variance64x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x64 sse2 neon/; + specialize qw/vpx_highbd_12_variance32x64 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x32 sse2 neon/; + specialize 
qw/vpx_highbd_12_variance32x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x16 sse2 neon/; + specialize qw/vpx_highbd_12_variance32x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x32 sse2 neon/; + specialize qw/vpx_highbd_12_variance16x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x16 sse2 neon/; + specialize qw/vpx_highbd_12_variance16x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x8 sse2 neon/; + specialize qw/vpx_highbd_12_variance16x8 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance8x16 sse2 neon/; + specialize qw/vpx_highbd_12_variance8x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance8x8 sse2 neon/; + specialize qw/vpx_highbd_12_variance8x8 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance8x4 neon/; + specialize qw/vpx_highbd_12_variance8x4 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t 
*src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance4x8 neon/; + specialize qw/vpx_highbd_12_variance4x8 neon sve/; add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance4x4 neon/; + specialize qw/vpx_highbd_12_variance4x4 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance64x64 sse2 neon/; + specialize qw/vpx_highbd_10_variance64x64 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance64x32 sse2 neon/; + specialize qw/vpx_highbd_10_variance64x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x64 sse2 neon/; + specialize qw/vpx_highbd_10_variance32x64 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x32 sse2 neon/; + specialize qw/vpx_highbd_10_variance32x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x16 sse2 neon/; + specialize qw/vpx_highbd_10_variance32x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize 
qw/vpx_highbd_10_variance16x32 sse2 neon/; + specialize qw/vpx_highbd_10_variance16x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance16x16 sse2 neon/; + specialize qw/vpx_highbd_10_variance16x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance16x8 sse2 neon/; + specialize qw/vpx_highbd_10_variance16x8 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance8x16 sse2 neon/; + specialize qw/vpx_highbd_10_variance8x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance8x8 sse2 neon/; + specialize qw/vpx_highbd_10_variance8x8 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance8x4 neon/; + specialize qw/vpx_highbd_10_variance8x4 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance4x8 neon/; + specialize qw/vpx_highbd_10_variance4x8 neon sve/; add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance4x4 neon/; + specialize qw/vpx_highbd_10_variance4x4 neon sve/; add_proto qw/unsigned int 
vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance64x64 sse2 neon/; + specialize qw/vpx_highbd_8_variance64x64 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance64x32 sse2 neon/; + specialize qw/vpx_highbd_8_variance64x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x64 sse2 neon/; + specialize qw/vpx_highbd_8_variance32x64 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x32 sse2 neon/; + specialize qw/vpx_highbd_8_variance32x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x16 sse2 neon/; + specialize qw/vpx_highbd_8_variance32x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x32 sse2 neon/; + specialize qw/vpx_highbd_8_variance16x32 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x16 sse2 neon/; + specialize qw/vpx_highbd_8_variance16x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, 
unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x8 sse2 neon/; + specialize qw/vpx_highbd_8_variance16x8 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance8x16 sse2 neon/; + specialize qw/vpx_highbd_8_variance8x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance8x8 sse2 neon/; + specialize qw/vpx_highbd_8_variance8x8 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance8x4 neon/; + specialize qw/vpx_highbd_8_variance8x4 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance4x8 neon/; + specialize qw/vpx_highbd_8_variance4x8 neon sve/; add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance4x4 neon/; + specialize qw/vpx_highbd_8_variance4x4 neon sve/; add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_8_get16x16var sse2 neon/; + specialize qw/vpx_highbd_8_get16x16var sse2 neon sve/; add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_8_get8x8var sse2 neon/; + specialize qw/vpx_highbd_8_get8x8var sse2 neon sve/; add_proto qw/void vpx_highbd_10_get16x16var/, 
"const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_10_get16x16var sse2 neon/; + specialize qw/vpx_highbd_10_get16x16var sse2 neon sve/; add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_10_get8x8var sse2 neon/; + specialize qw/vpx_highbd_10_get8x8var sse2 neon sve/; add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_12_get16x16var sse2 neon/; + specialize qw/vpx_highbd_12_get16x16var sse2 neon sve/; add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_12_get8x8var sse2 neon/; + specialize qw/vpx_highbd_12_get8x8var sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse16x16 sse2 neon/; + specialize qw/vpx_highbd_8_mse16x16 sse2 neon neon_dotprod/; add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse16x8 neon/; + specialize qw/vpx_highbd_8_mse16x8 neon neon_dotprod/; add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse8x16 neon/; + specialize qw/vpx_highbd_8_mse8x16 neon neon_dotprod/; add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse8x8 sse2 neon/; + specialize 
qw/vpx_highbd_8_mse8x8 sse2 neon neon_dotprod/; add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_mse16x16 sse2 neon/; + specialize qw/vpx_highbd_10_mse16x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_mse16x8 neon/; + specialize qw/vpx_highbd_10_mse16x8 neon sve/; add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_mse8x16 neon/; + specialize qw/vpx_highbd_10_mse8x16 neon sve/; add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_mse8x8 sse2 neon/; + specialize qw/vpx_highbd_10_mse8x8 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_mse16x16 sse2 neon/; + specialize qw/vpx_highbd_12_mse16x16 sse2 neon sve/; add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_mse16x8 neon/; + specialize qw/vpx_highbd_12_mse16x8 neon sve/; add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_mse8x16 neon/; + specialize qw/vpx_highbd_12_mse8x16 neon sve/; add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_mse8x8 sse2 neon/; + 
specialize qw/vpx_highbd_12_mse8x8 sse2 neon sve/; add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride"; specialize qw/vpx_highbd_comp_avg_pred neon sse2/; diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_filter.h b/media/libvpx/libvpx/vpx_dsp/vpx_filter.h index 0cddcb6991..eb8ff06cd7 100644 --- a/media/libvpx/libvpx/vpx_dsp/vpx_filter.h +++ b/media/libvpx/libvpx/vpx_dsp/vpx_filter.h @@ -28,7 +28,6 @@ extern "C" { typedef int16_t InterpKernel[SUBPEL_TAPS]; static INLINE int vpx_get_filter_taps(const int16_t *const filter) { - assert(filter[3] != 128); if (filter[0] | filter[7]) { return 8; } diff --git a/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c index 539d09bb39..eba12d312a 100644 --- a/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c +++ b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c @@ -15,7 +15,7 @@ #include <sys/sysctl.h> #endif -#if !CONFIG_RUNTIME_CPU_DETECT +#if !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__) static int arm_get_cpu_caps(void) { // This function should actually be a no-op. There is no way to adjust any of @@ -28,7 +28,7 @@ static int arm_get_cpu_caps(void) { return flags; } -#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT +#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__) // sysctlbyname() parameter documentation for instruction set characteristics: // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics @@ -99,14 +99,17 @@ static int arm_get_cpu_caps(void) { // hwcap values are not defined should not prevent features from being enabled. 
#define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20) #define VPX_AARCH64_HWCAP_SVE (1 << 22) +#define VPX_AARCH64_HWCAP2_SVE2 (1 << 1) #define VPX_AARCH64_HWCAP2_I8MM (1 << 13) static int arm_get_cpu_caps(void) { int flags = 0; +#if HAVE_NEON_DOTPROD || HAVE_SVE unsigned long hwcap = getauxval(AT_HWCAP); -#if HAVE_NEON_I8MM +#endif // HAVE_NEON_DOTPROD || HAVE_SVE +#if HAVE_NEON_I8MM || HAVE_SVE2 unsigned long hwcap2 = getauxval(AT_HWCAP2); -#endif // HAVE_NEON_I8MM +#endif // HAVE_NEON_I8MM || HAVE_SVE2 #if HAVE_NEON flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. #endif // HAVE_NEON @@ -125,6 +128,11 @@ static int arm_get_cpu_caps(void) { flags |= HAS_SVE; } #endif // HAVE_SVE +#if HAVE_SVE2 + if (hwcap2 & VPX_AARCH64_HWCAP2_SVE2) { + flags |= HAS_SVE2; + } +#endif // HAVE_SVE2 return flags; } @@ -195,5 +203,10 @@ int arm_cpu_caps(void) { flags &= ~HAS_SVE; } + // Restrict flags: FEAT_SVE2 assumes that FEAT_SVE is available. + if (!(flags & HAS_SVE)) { + flags &= ~HAS_SVE2; + } + return flags; } diff --git a/media/libvpx/libvpx/vpx_ports/arm.h b/media/libvpx/libvpx/vpx_ports/arm.h index 39365d18ee..814c3cc408 100644 --- a/media/libvpx/libvpx/vpx_ports/arm.h +++ b/media/libvpx/libvpx/vpx_ports/arm.h @@ -25,6 +25,8 @@ extern "C" { #define HAS_NEON_I8MM (1 << 2) // Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A. #define HAS_SVE (1 << 3) +// Armv9.0-A SVE2 instructions. 
+#define HAS_SVE2 (1 << 4) int arm_cpu_caps(void); diff --git a/media/libvpx/libvpx/vpx_ports/emms_mmx.c b/media/libvpx/libvpx/vpx_ports/emms_mmx.c index f1036b98ed..79b98a75f1 100644 --- a/media/libvpx/libvpx/vpx_ports/emms_mmx.c +++ b/media/libvpx/libvpx/vpx_ports/emms_mmx.c @@ -12,4 +12,4 @@ #include "vpx_ports/system_state.h" -void vpx_clear_system_state() { _mm_empty(); } +void vpx_clear_system_state(void) { _mm_empty(); } diff --git a/media/libvpx/libvpx/vpx_ports/mem.h b/media/libvpx/libvpx/vpx_ports/mem.h index 5eccfe8f50..ee9e095633 100644 --- a/media/libvpx/libvpx/vpx_ports/mem.h +++ b/media/libvpx/libvpx/vpx_ports/mem.h @@ -23,7 +23,13 @@ #define DECLARE_ALIGNED(n, typ, val) typ val #endif -#if HAVE_NEON && defined(_MSC_VER) +#if defined(__has_builtin) +#define VPX_HAS_BUILTIN(x) __has_builtin(x) +#else +#define VPX_HAS_BUILTIN(x) 0 +#endif + +#if !VPX_HAS_BUILTIN(__builtin_prefetch) && !defined(__GNUC__) #define __builtin_prefetch(x) #endif diff --git a/media/libvpx/libvpx/vpx_ports/vpx_once.h b/media/libvpx/libvpx/vpx_ports/vpx_once.h index d8a8ed89fe..d33eff4397 100644 --- a/media/libvpx/libvpx/vpx_ports/vpx_once.h +++ b/media/libvpx/libvpx/vpx_ports/vpx_once.h @@ -91,29 +91,6 @@ static void once(void (*func)(void)) { return; } -#elif CONFIG_MULTITHREAD && defined(__OS2__) -#define INCL_DOS -#include <os2.h> -static void once(void (*func)(void)) { - static volatile int done; - - /* If the initialization is complete, return early. */ - if (done) return; - - /* Causes all other threads in the process to block themselves - * and give up their time slice. - */ - DosEnterCritSec(); - - if (!done) { - func(); - done = 1; - } - - /* Restores normal thread dispatching for the current process. 
*/ - DosExitCritSec(); -} - #elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H #include <pthread.h> static void once(void (*func)(void)) { diff --git a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c index dc4d9593a8..706b0770c8 100644 --- a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c +++ b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c @@ -12,4 +12,4 @@ #include "./vpx_scale_rtcd.h" #include "vpx_ports/vpx_once.h" -void vpx_scale_rtcd() { once(setup_rtcd_internal); } +void vpx_scale_rtcd(void) { once(setup_rtcd_internal); } diff --git a/media/libvpx/libvpx/vpx_util/vpx_pthread.h b/media/libvpx/libvpx/vpx_util/vpx_pthread.h new file mode 100644 index 0000000000..cdd18d0f30 --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_pthread.h @@ -0,0 +1,157 @@ +// Copyright 2024 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// pthread.h wrapper + +#ifndef VPX_VPX_UTIL_VPX_PTHREAD_H_ +#define VPX_VPX_UTIL_VPX_PTHREAD_H_ + +#include "./vpx_config.h" + +#if CONFIG_MULTITHREAD + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) && !HAVE_PTHREAD_H +// Prevent leaking max/min macros. +#undef NOMINMAX +#define NOMINMAX +#undef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#include <errno.h> // NOLINT +#include <process.h> // NOLINT +#include <stddef.h> // NOLINT +#include <windows.h> // NOLINT +typedef HANDLE pthread_t; +typedef CRITICAL_SECTION pthread_mutex_t; + +#if _WIN32_WINNT < 0x0600 +#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer. 
+#endif +typedef CONDITION_VARIABLE pthread_cond_t; + +#ifndef WINAPI_FAMILY_PARTITION +#define WINAPI_PARTITION_DESKTOP 1 +#define WINAPI_FAMILY_PARTITION(x) x +#endif + +#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define USE_CREATE_THREAD +#endif + +//------------------------------------------------------------------------------ +// simplistic pthread emulation layer + +// _beginthreadex requires __stdcall +#if defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) +#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall +#else +#define THREADFN unsigned int __stdcall +#endif +#define THREAD_EXIT_SUCCESS 0 + +static INLINE int pthread_create(pthread_t *const thread, const void *attr, + unsigned int(__stdcall *start)(void *), + void *arg) { + (void)attr; +#ifdef USE_CREATE_THREAD + *thread = CreateThread(NULL, /* lpThreadAttributes */ + 0, /* dwStackSize */ + start, arg, 0, /* dwStackSize */ + NULL); /* lpThreadId */ +#else + *thread = (pthread_t)_beginthreadex(NULL, /* void *security */ + 0, /* unsigned stack_size */ + start, arg, 0, /* unsigned initflag */ + NULL); /* unsigned *thrdaddr */ +#endif + if (*thread == NULL) return 1; + SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL); + return 0; +} + +static INLINE int pthread_join(pthread_t thread, void **value_ptr) { + (void)value_ptr; + return (WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/) != + WAIT_OBJECT_0 || + CloseHandle(thread) == 0); +} + +// Mutex +static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex, + void *mutexattr) { + (void)mutexattr; + InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/); + return 0; +} + +static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) { + return TryEnterCriticalSection(mutex) ? 
0 : EBUSY; +} + +static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) { + EnterCriticalSection(mutex); + return 0; +} + +static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) { + LeaveCriticalSection(mutex); + return 0; +} + +static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) { + DeleteCriticalSection(mutex); + return 0; +} + +// Condition +static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) { + (void)condition; + return 0; +} + +static INLINE int pthread_cond_init(pthread_cond_t *const condition, + void *cond_attr) { + (void)cond_attr; + InitializeConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { + WakeConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { + WakeAllConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_wait(pthread_cond_t *const condition, + pthread_mutex_t *const mutex) { + int ok; + ok = SleepConditionVariableCS(condition, mutex, INFINITE); + return !ok; +} +#else // _WIN32 +#include <pthread.h> // NOLINT +#define THREADFN void * +#define THREAD_EXIT_SUCCESS NULL +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // CONFIG_MULTITHREAD + +#endif // VPX_VPX_UTIL_VPX_PTHREAD_H_ diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.c b/media/libvpx/libvpx/vpx_util/vpx_thread.c index 04c5fb6f26..0d0e2f5766 100644 --- a/media/libvpx/libvpx/vpx_util/vpx_thread.c +++ b/media/libvpx/libvpx/vpx_util/vpx_thread.c @@ -12,10 +12,18 @@ // Original source: // https://chromium.googlesource.com/webm/libwebp +// Enable GNU extensions in glibc so that we can call pthread_setname_np(). +// This must be before any #include statements. 
+#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + #include <assert.h> #include <string.h> // for memset() +#include "./vpx_config.h" #include "./vpx_thread.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_util/vpx_pthread.h" #if CONFIG_MULTITHREAD @@ -31,23 +39,54 @@ static void execute(VPxWorker *const worker); // Forward declaration. static THREADFN thread_loop(void *ptr) { VPxWorker *const worker = (VPxWorker *)ptr; - int done = 0; - while (!done) { - pthread_mutex_lock(&worker->impl_->mutex_); - while (worker->status_ == OK) { // wait in idling mode +#ifdef __APPLE__ + if (worker->thread_name != NULL) { + // Apple's version of pthread_setname_np takes one argument and operates on + // the current thread only. The maximum size of the thread_name buffer was + // noted in the Chromium source code and was confirmed by experiments. If + // thread_name is too long, pthread_setname_np returns -1 with errno + // ENAMETOOLONG (63). + char thread_name[64]; + strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1); + thread_name[sizeof(thread_name) - 1] = '\0'; + pthread_setname_np(thread_name); + } +#elif (defined(__GLIBC__) && !defined(__GNU__)) || defined(__BIONIC__) + if (worker->thread_name != NULL) { + // Linux and Android require names (with nul) fit in 16 chars, otherwise + // pthread_setname_np() returns ERANGE (34). 
+ char thread_name[16]; + strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1); + thread_name[sizeof(thread_name) - 1] = '\0'; + pthread_setname_np(pthread_self(), thread_name); + } +#endif + pthread_mutex_lock(&worker->impl_->mutex_); + for (;;) { + while (worker->status_ == VPX_WORKER_STATUS_OK) { // wait in idling mode pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); } - if (worker->status_ == WORK) { + if (worker->status_ == VPX_WORKER_STATUS_WORKING) { + // When worker->status_ is VPX_WORKER_STATUS_WORKING, the main thread + // doesn't change worker->status_ and will wait until the worker changes + // worker->status_ to VPX_WORKER_STATUS_OK. See change_state(). So the + // worker can safely call execute() without holding worker->impl_->mutex_. + // When the worker reacquires worker->impl_->mutex_, worker->status_ must + // still be VPX_WORKER_STATUS_WORKING. + pthread_mutex_unlock(&worker->impl_->mutex_); execute(worker); - worker->status_ = OK; - } else if (worker->status_ == NOT_OK) { // finish the worker - done = 1; + pthread_mutex_lock(&worker->impl_->mutex_); + assert(worker->status_ == VPX_WORKER_STATUS_WORKING); + worker->status_ = VPX_WORKER_STATUS_OK; + // signal to the main thread that we're done (for sync()) + pthread_cond_signal(&worker->impl_->condition_); + } else { + assert(worker->status_ == VPX_WORKER_STATUS_NOT_OK); // finish the worker + break; } - // signal to the main thread that we're done (for sync()) - pthread_cond_signal(&worker->impl_->condition_); - pthread_mutex_unlock(&worker->impl_->mutex_); } - return THREAD_RETURN(NULL); // Thread is finished + pthread_mutex_unlock(&worker->impl_->mutex_); + return THREAD_EXIT_SUCCESS; // Thread is finished } // main thread state control @@ -58,13 +97,13 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) { if (worker->impl_ == NULL) return; pthread_mutex_lock(&worker->impl_->mutex_); - if (worker->status_ >= OK) { + if 
(worker->status_ >= VPX_WORKER_STATUS_OK) { // wait for the worker to finish - while (worker->status_ != OK) { + while (worker->status_ != VPX_WORKER_STATUS_OK) { pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); } // assign new status and release the working thread if needed - if (new_status != OK) { + if (new_status != VPX_WORKER_STATUS_OK) { worker->status_ = new_status; pthread_cond_signal(&worker->impl_->condition_); } @@ -78,21 +117,21 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) { static void init(VPxWorker *const worker) { memset(worker, 0, sizeof(*worker)); - worker->status_ = NOT_OK; + worker->status_ = VPX_WORKER_STATUS_NOT_OK; } static int sync(VPxWorker *const worker) { #if CONFIG_MULTITHREAD - change_state(worker, OK); + change_state(worker, VPX_WORKER_STATUS_OK); #endif - assert(worker->status_ <= OK); + assert(worker->status_ <= VPX_WORKER_STATUS_OK); return !worker->had_error; } static int reset(VPxWorker *const worker) { int ok = 1; worker->had_error = 0; - if (worker->status_ < OK) { + if (worker->status_ < VPX_WORKER_STATUS_OK) { #if CONFIG_MULTITHREAD worker->impl_ = (VPxWorkerImpl *)vpx_calloc(1, sizeof(*worker->impl_)); if (worker->impl_ == NULL) { @@ -107,7 +146,7 @@ static int reset(VPxWorker *const worker) { } pthread_mutex_lock(&worker->impl_->mutex_); ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker); - if (ok) worker->status_ = OK; + if (ok) worker->status_ = VPX_WORKER_STATUS_OK; pthread_mutex_unlock(&worker->impl_->mutex_); if (!ok) { pthread_mutex_destroy(&worker->impl_->mutex_); @@ -118,12 +157,12 @@ static int reset(VPxWorker *const worker) { return 0; } #else - worker->status_ = OK; + worker->status_ = VPX_WORKER_STATUS_OK; #endif - } else if (worker->status_ > OK) { + } else if (worker->status_ > VPX_WORKER_STATUS_OK) { ok = sync(worker); } - assert(!ok || (worker->status_ == OK)); + assert(!ok || (worker->status_ == VPX_WORKER_STATUS_OK)); return ok; 
} @@ -135,7 +174,7 @@ static void execute(VPxWorker *const worker) { static void launch(VPxWorker *const worker) { #if CONFIG_MULTITHREAD - change_state(worker, WORK); + change_state(worker, VPX_WORKER_STATUS_WORKING); #else execute(worker); #endif @@ -144,7 +183,7 @@ static void launch(VPxWorker *const worker) { static void end(VPxWorker *const worker) { #if CONFIG_MULTITHREAD if (worker->impl_ != NULL) { - change_state(worker, NOT_OK); + change_state(worker, VPX_WORKER_STATUS_NOT_OK); pthread_join(worker->impl_->thread_, NULL); pthread_mutex_destroy(&worker->impl_->mutex_); pthread_cond_destroy(&worker->impl_->condition_); @@ -152,10 +191,10 @@ static void end(VPxWorker *const worker) { worker->impl_ = NULL; } #else - worker->status_ = NOT_OK; + worker->status_ = VPX_WORKER_STATUS_NOT_OK; assert(worker->impl_ == NULL); #endif - assert(worker->status_ == NOT_OK); + assert(worker->status_ == VPX_WORKER_STATUS_NOT_OK); } //------------------------------------------------------------------------------ diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.h b/media/libvpx/libvpx/vpx_util/vpx_thread.h index 6d308e949b..11a1d74387 100644 --- a/media/libvpx/libvpx/vpx_util/vpx_thread.h +++ b/media/libvpx/libvpx/vpx_util/vpx_thread.h @@ -15,370 +15,22 @@ #ifndef VPX_VPX_UTIL_VPX_THREAD_H_ #define VPX_VPX_UTIL_VPX_THREAD_H_ -#include "./vpx_config.h" - #ifdef __cplusplus extern "C" { #endif -// Set maximum decode threads to be 8 due to the limit of frame buffers -// and not enough semaphores in the emulation layer on windows. 
-#define MAX_DECODE_THREADS 8 - -#if CONFIG_MULTITHREAD - -#if defined(_WIN32) && !HAVE_PTHREAD_H -#include <errno.h> // NOLINT -#include <process.h> // NOLINT -#include <windows.h> // NOLINT -typedef HANDLE pthread_t; -typedef CRITICAL_SECTION pthread_mutex_t; - -#if _WIN32_WINNT >= 0x0600 // Windows Vista / Server 2008 or greater -#define USE_WINDOWS_CONDITION_VARIABLE -typedef CONDITION_VARIABLE pthread_cond_t; -#else -typedef struct { - HANDLE waiting_sem_; - HANDLE received_sem_; - HANDLE signal_event_; -} pthread_cond_t; -#endif // _WIN32_WINNT >= 0x600 - -#ifndef WINAPI_FAMILY_PARTITION -#define WINAPI_PARTITION_DESKTOP 1 -#define WINAPI_FAMILY_PARTITION(x) x -#endif - -#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) -#define USE_CREATE_THREAD -#endif - -//------------------------------------------------------------------------------ -// simplistic pthread emulation layer - -// _beginthreadex requires __stdcall -#if defined(__GNUC__) && \ - (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) -#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall -#else -#define THREADFN unsigned int __stdcall -#endif -#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val) - -#if _WIN32_WINNT >= 0x0501 // Windows XP or greater -#define WaitForSingleObject(obj, timeout) \ - WaitForSingleObjectEx(obj, timeout, FALSE /*bAlertable*/) -#endif - -static INLINE int pthread_create(pthread_t *const thread, const void *attr, - unsigned int(__stdcall *start)(void *), - void *arg) { - (void)attr; -#ifdef USE_CREATE_THREAD - *thread = CreateThread(NULL, /* lpThreadAttributes */ - 0, /* dwStackSize */ - start, arg, 0, /* dwStackSize */ - NULL); /* lpThreadId */ -#else - *thread = (pthread_t)_beginthreadex(NULL, /* void *security */ - 0, /* unsigned stack_size */ - start, arg, 0, /* unsigned initflag */ - NULL); /* unsigned *thrdaddr */ -#endif - if (*thread == NULL) return 1; - SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL); - return 0; 
-} - -static INLINE int pthread_join(pthread_t thread, void **value_ptr) { - (void)value_ptr; - return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 || - CloseHandle(thread) == 0); -} - -// Mutex -static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex, - void *mutexattr) { - (void)mutexattr; -#if _WIN32_WINNT >= 0x0600 // Windows Vista / Server 2008 or greater - InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/); -#else - InitializeCriticalSection(mutex); -#endif - return 0; -} - -static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) { - return TryEnterCriticalSection(mutex) ? 0 : EBUSY; -} - -static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) { - EnterCriticalSection(mutex); - return 0; -} - -static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) { - LeaveCriticalSection(mutex); - return 0; -} - -static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) { - DeleteCriticalSection(mutex); - return 0; -} - -// Condition -static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) { - int ok = 1; -#ifdef USE_WINDOWS_CONDITION_VARIABLE - (void)condition; -#else - ok &= (CloseHandle(condition->waiting_sem_) != 0); - ok &= (CloseHandle(condition->received_sem_) != 0); - ok &= (CloseHandle(condition->signal_event_) != 0); -#endif - return !ok; -} - -static INLINE int pthread_cond_init(pthread_cond_t *const condition, - void *cond_attr) { - (void)cond_attr; -#ifdef USE_WINDOWS_CONDITION_VARIABLE - InitializeConditionVariable(condition); -#else - condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL); - condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL); - condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL); - if (condition->waiting_sem_ == NULL || condition->received_sem_ == NULL || - condition->signal_event_ == NULL) { - pthread_cond_destroy(condition); - return 1; - } -#endif - return 0; -} - -static 
INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { - int ok = 1; -#ifdef USE_WINDOWS_CONDITION_VARIABLE - WakeAllConditionVariable(condition); -#else - while (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) { - // a thread is waiting in pthread_cond_wait: allow it to be notified - ok &= SetEvent(condition->signal_event_); - // wait until the event is consumed so the signaler cannot consume - // the event via its own pthread_cond_wait. - ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) != - WAIT_OBJECT_0); - } -#endif - return !ok; -} - -static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { - int ok = 1; -#ifdef USE_WINDOWS_CONDITION_VARIABLE - WakeConditionVariable(condition); -#else - if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) { - // a thread is waiting in pthread_cond_wait: allow it to be notified - ok = SetEvent(condition->signal_event_); - // wait until the event is consumed so the signaler cannot consume - // the event via its own pthread_cond_wait. 
- ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) != - WAIT_OBJECT_0); - } -#endif - return !ok; -} - -static INLINE int pthread_cond_wait(pthread_cond_t *const condition, - pthread_mutex_t *const mutex) { - int ok; -#ifdef USE_WINDOWS_CONDITION_VARIABLE - ok = SleepConditionVariableCS(condition, mutex, INFINITE); -#else - // note that there is a consumer available so the signal isn't dropped in - // pthread_cond_signal - if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) return 1; - // now unlock the mutex so pthread_cond_signal may be issued - pthread_mutex_unlock(mutex); - ok = (WaitForSingleObject(condition->signal_event_, INFINITE) == - WAIT_OBJECT_0); - ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL); - pthread_mutex_lock(mutex); -#endif - return !ok; -} - -#elif defined(__OS2__) -#define INCL_DOS -#include <os2.h> // NOLINT - -#include <errno.h> // NOLINT -#include <stdlib.h> // NOLINT -#include <sys/builtin.h> // NOLINT - -#if defined(__STRICT_ANSI__) -// _beginthread() is not declared on __STRICT_ANSI__ mode. Declare here. 
-int _beginthread(void (*)(void *), void *, unsigned, void *); -#endif - -#define pthread_t TID -#define pthread_mutex_t HMTX - -typedef struct { - HEV event_sem_; - HEV ack_sem_; - volatile unsigned wait_count_; -} pthread_cond_t; - -//------------------------------------------------------------------------------ -// simplistic pthread emulation layer - -#define THREADFN void * -#define THREAD_RETURN(val) (val) - -typedef struct { - void *(*start_)(void *); - void *arg_; -} thread_arg; - -static void thread_start(void *arg) { - thread_arg targ = *(thread_arg *)arg; - free(arg); - - targ.start_(targ.arg_); -} - -static INLINE int pthread_create(pthread_t *const thread, const void *attr, - void *(*start)(void *), void *arg) { - int tid; - thread_arg *targ = (thread_arg *)malloc(sizeof(*targ)); - if (targ == NULL) return 1; - - (void)attr; - - targ->start_ = start; - targ->arg_ = arg; - tid = (pthread_t)_beginthread(thread_start, NULL, 1024 * 1024, targ); - if (tid == -1) { - free(targ); - return 1; - } - - *thread = tid; - return 0; -} - -static INLINE int pthread_join(pthread_t thread, void **value_ptr) { - (void)value_ptr; - return DosWaitThread(&thread, DCWW_WAIT) != 0; -} - -// Mutex -static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex, - void *mutexattr) { - (void)mutexattr; - return DosCreateMutexSem(NULL, mutex, 0, FALSE) != 0; -} - -static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) { - return DosRequestMutexSem(*mutex, SEM_IMMEDIATE_RETURN) == 0 ? 
0 : EBUSY; -} - -static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) { - return DosRequestMutexSem(*mutex, SEM_INDEFINITE_WAIT) != 0; -} - -static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) { - return DosReleaseMutexSem(*mutex) != 0; -} - -static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) { - return DosCloseMutexSem(*mutex) != 0; -} - -// Condition -static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) { - int ok = 1; - ok &= DosCloseEventSem(condition->event_sem_) == 0; - ok &= DosCloseEventSem(condition->ack_sem_) == 0; - return !ok; -} - -static INLINE int pthread_cond_init(pthread_cond_t *const condition, - void *cond_attr) { - int ok = 1; - (void)cond_attr; - - ok &= - DosCreateEventSem(NULL, &condition->event_sem_, DCE_POSTONE, FALSE) == 0; - ok &= DosCreateEventSem(NULL, &condition->ack_sem_, DCE_POSTONE, FALSE) == 0; - if (!ok) { - pthread_cond_destroy(condition); - return 1; - } - condition->wait_count_ = 0; - return 0; -} - -static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { - int ok = 1; - - if (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) { - ok &= DosPostEventSem(condition->event_sem_) == 0; - ok &= DosWaitEventSem(condition->ack_sem_, SEM_INDEFINITE_WAIT) == 0; - } - - return !ok; -} - -static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { - int ok = 1; - - while (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) - ok &= pthread_cond_signal(condition) == 0; - - return !ok; -} - -static INLINE int pthread_cond_wait(pthread_cond_t *const condition, - pthread_mutex_t *const mutex) { - int ok = 1; - - __atomic_increment(&condition->wait_count_); - - ok &= pthread_mutex_unlock(mutex) == 0; - - ok &= DosWaitEventSem(condition->event_sem_, SEM_INDEFINITE_WAIT) == 0; - - __atomic_decrement(&condition->wait_count_); - - ok &= DosPostEventSem(condition->ack_sem_) == 0; - - pthread_mutex_lock(mutex); - - return !ok; -} -#else // _WIN32 
-#include <pthread.h> // NOLINT -#define THREADFN void * -#define THREAD_RETURN(val) val -#endif - -#endif // CONFIG_MULTITHREAD +#define MAX_NUM_THREADS 64 // State of the worker thread object typedef enum { - NOT_OK = 0, // object is unusable - OK, // ready to work - WORK // busy finishing the current task + VPX_WORKER_STATUS_NOT_OK = 0, // object is unusable + VPX_WORKER_STATUS_OK, // ready to work + VPX_WORKER_STATUS_WORKING // busy finishing the current task } VPxWorkerStatus; // Function to be called by the worker thread. Takes two opaque pointers as -// arguments (data1 and data2), and should return false in case of error. +// arguments (data1 and data2). Should return true on success and return false +// in case of error. typedef int (*VPxWorkerHook)(void *, void *); // Platform-dependent implementation details for the worker. @@ -388,10 +40,14 @@ typedef struct VPxWorkerImpl VPxWorkerImpl; typedef struct { VPxWorkerImpl *impl_; VPxWorkerStatus status_; + // Thread name for the debugger. If not NULL, must point to a string that + // outlives the worker thread. For portability, use a name <= 15 characters + // long (not including the terminating NUL character). + const char *thread_name; VPxWorkerHook hook; // hook to call void *data1; // first argument passed to 'hook' void *data2; // second argument passed to 'hook' - int had_error; // return value of the last call to 'hook' + int had_error; // true if a call to 'hook' returned false } VPxWorker; // The interface for all thread-worker related functions. 
All these functions diff --git a/media/libvpx/libvpx/vpx_util/vpx_util.mk b/media/libvpx/libvpx/vpx_util/vpx_util.mk index 1162714956..948e6d6f89 100644 --- a/media/libvpx/libvpx/vpx_util/vpx_util.mk +++ b/media/libvpx/libvpx/vpx_util/vpx_util.mk @@ -10,6 +10,7 @@ UTIL_SRCS-yes += vpx_atomics.h UTIL_SRCS-yes += vpx_util.mk +UTIL_SRCS-yes += vpx_pthread.h UTIL_SRCS-yes += vpx_thread.c UTIL_SRCS-yes += vpx_thread.h UTIL_SRCS-yes += endian_inl.h diff --git a/media/libvpx/missing_header.patch b/media/libvpx/missing_header.patch new file mode 100644 index 0000000000..02b77170ee --- /dev/null +++ b/media/libvpx/missing_header.patch @@ -0,0 +1,12 @@ +Add missing header for EBUSY + +--- a/vpx_util/vpx_pthread.h ++++ b/vpx_util/vpx_pthread.h +@@ -26,6 +26,7 @@ extern "C" { + #define NOMINMAX + #undef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN ++#include <errno.h> // NOLINT + #include <process.h> // NOLINT + #include <stddef.h> // NOLINT + #include <windows.h> // NOLINT diff --git a/media/libvpx/moz.build b/media/libvpx/moz.build index 582bc6fd5d..635b5d0fdd 100644 --- a/media/libvpx/moz.build +++ b/media/libvpx/moz.build @@ -72,7 +72,10 @@ elif CONFIG['TARGET_CPU'] == 'arm': ] elif CONFIG['TARGET_CPU'] == 'aarch64' and CONFIG['OS_TARGET'] == 'WINNT': EXPORTS.vpx += files['ARM64_EXPORTS'] - SOURCES += files['ARM64_SOURCES'] + # Bug 1885585: clang on win/aarch64 cannot compile SVInt8_t type for now. 
+ SOURCES += [ + f for f in files['ARM64_SOURCES'] if not f.endswith('_sve.c') + ] ASFLAGS += [ '-I%s/media/libvpx/config/win/aarch64/' % TOPSRCDIR ] LOCAL_INCLUDES += [ '/media/libvpx/config/win/aarch64/' ] SOURCES += [ '/media/libvpx/config/win/aarch64/vpx_config.c' ] @@ -125,6 +128,10 @@ for f in SOURCES: SOURCES[f].flags += ['-march=armv8.2-a+dotprod'] if 'neon_i8mm.c' in f: SOURCES[f].flags += ['-march=armv8.2-a+dotprod+i8mm'] + if 'sve.c' in f: + SOURCES[f].flags += ['-march=armv8.2-a+dotprod+i8mm+sve'] + if 'sve2.c' in f: + SOURCES[f].flags += ['-march=armv9-a+sve2'] # Suppress warnings in third-party code. CFLAGS += [ diff --git a/media/libvpx/moz.yaml b/media/libvpx/moz.yaml index 17704a1905..0b3ec52482 100644 --- a/media/libvpx/moz.yaml +++ b/media/libvpx/moz.yaml @@ -20,11 +20,11 @@ origin: # Human-readable identifier for this version/release # Generally "version NNN", "tag SSS", "bookmark SSS" - release: f6b7166a2b6bac544c2c487d3a7e49bc265cdf9d (Tue Jan 02 20:08:06 2024). + release: 7fb8ceccf92c35cd5131b05c0502916715ebc76b (Fri Mar 15 01:11:50 2024). 
# Revision to pull in # Must be a long or short commit SHA (long preferred) - revision: f6b7166a2b6bac544c2c487d3a7e49bc265cdf9d + revision: 7fb8ceccf92c35cd5131b05c0502916715ebc76b # The package's license, where possible using the mnemonic from # https://spdx.org/licenses/ @@ -53,8 +53,10 @@ vendoring: - tools/ patches: + - arm_cpu_runtime_detection_code_on_openbsd.patch - input_frame_validation.patch - input_frame_validation_vp9.patch + - missing_header.patch update-actions: - action: move-file diff --git a/media/libvpx/sources.mozbuild b/media/libvpx/sources.mozbuild index 2960dee255..1ad5d4447c 100644 --- a/media/libvpx/sources.mozbuild +++ b/media/libvpx/sources.mozbuild @@ -934,6 +934,7 @@ files = { 'libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c', 'libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c', 'libvpx/vp9/encoder/arm/neon/vp9_error_neon.c', + 'libvpx/vp9/encoder/arm/neon/vp9_error_sve.c', 'libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c', 'libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c', 'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c', @@ -1006,6 +1007,7 @@ files = { 'libvpx/vpx_dsp/arm/subpel_variance_neon.c', 'libvpx/vpx_dsp/arm/subtract_neon.c', 'libvpx/vpx_dsp/arm/sum_squares_neon.c', + 'libvpx/vpx_dsp/arm/sum_squares_sve.c', 'libvpx/vpx_dsp/arm/variance_neon.c', 'libvpx/vpx_dsp/arm/variance_neon_dotprod.c', 'libvpx/vpx_dsp/arm/vpx_convolve8_neon.c', @@ -1014,8 +1016,6 @@ files = { 'libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c', 'libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c', 'libvpx/vpx_dsp/arm/vpx_convolve_neon.c', - 'libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c', - 'libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c', 'libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c', 'libvpx/vpx_dsp/avg.c', 'libvpx/vpx_dsp/bitreader.c', |