Diffstat (limited to 'media')
-rw-r--r--  media/ffvpx/README_MOZILLA | 4
-rw-r--r--  media/ffvpx/config_components_audio_only.h | 4
-rw-r--r--  media/ffvpx/config_components_audio_video.h | 4
-rw-r--r--  media/ffvpx/libavcodec/audio_frame_queue.c | 113
-rw-r--r--  media/ffvpx/libavcodec/audio_frame_queue.h | 83
-rw-r--r--  media/ffvpx/libavcodec/codec_list.c | 6
-rw-r--r--  media/ffvpx/libavcodec/libopusenc.c | 610
-rw-r--r--  media/ffvpx/libavcodec/libvorbisenc.c | 393
-rw-r--r--  media/ffvpx/libavcodec/moz.build | 3
-rw-r--r--  media/ffvpx/libavutil/avutil.symbols | 1
-rw-r--r--  media/ffvpx/opusenc-dtx.patch | 63
-rw-r--r--  media/libaom/0001-errno.patch | 22
-rw-r--r--  media/libaom/0002-mmloadusi64.patch | 79
-rw-r--r--  media/libaom/config/generic/config/aom_config.asm | 2
-rw-r--r--  media/libaom/config/generic/config/aom_config.h | 2
-rw-r--r--  media/libaom/config/generic/config/aom_dsp_rtcd.h | 9
-rw-r--r--  media/libaom/config/generic/config/aom_scale_rtcd.h | 12
-rw-r--r--  media/libaom/config/linux/arm/config/aom_config.asm | 2
-rw-r--r--  media/libaom/config/linux/arm/config/aom_config.h | 2
-rw-r--r--  media/libaom/config/linux/arm/config/aom_dsp_rtcd.h | 9
-rw-r--r--  media/libaom/config/linux/arm/config/aom_scale_rtcd.h | 12
-rw-r--r--  media/libaom/config/linux/ia32/config/aom_config.asm | 2
-rw-r--r--  media/libaom/config/linux/ia32/config/aom_config.h | 2
-rw-r--r--  media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h | 130
-rw-r--r--  media/libaom/config/linux/ia32/config/aom_scale_rtcd.h | 12
-rw-r--r--  media/libaom/config/linux/ia32/config/av1_rtcd.h | 26
-rw-r--r--  media/libaom/config/linux/x64/config/aom_config.asm | 2
-rw-r--r--  media/libaom/config/linux/x64/config/aom_config.h | 2
-rw-r--r--  media/libaom/config/linux/x64/config/aom_dsp_rtcd.h | 179
-rw-r--r--  media/libaom/config/linux/x64/config/aom_scale_rtcd.h | 12
-rw-r--r--  media/libaom/config/linux/x64/config/av1_rtcd.h | 66
-rw-r--r--  media/libaom/config/mac/x64/config/aom_config.asm | 2
-rw-r--r--  media/libaom/config/mac/x64/config/aom_config.h | 2
-rw-r--r--  media/libaom/config/mac/x64/config/aom_dsp_rtcd.h | 179
-rw-r--r--  media/libaom/config/mac/x64/config/aom_scale_rtcd.h | 12
-rw-r--r--  media/libaom/config/mac/x64/config/av1_rtcd.h | 66
-rw-r--r--  media/libaom/config/win/ia32/config/aom_config.asm | 2
-rw-r--r--  media/libaom/config/win/ia32/config/aom_config.h | 2
-rw-r--r--  media/libaom/config/win/ia32/config/aom_dsp_rtcd.h | 130
-rw-r--r--  media/libaom/config/win/ia32/config/aom_scale_rtcd.h | 12
-rw-r--r--  media/libaom/config/win/ia32/config/av1_rtcd.h | 26
-rw-r--r--  media/libaom/config/win/x64/config/aom_config.asm | 2
-rw-r--r--  media/libaom/config/win/x64/config/aom_config.h | 2
-rw-r--r--  media/libaom/config/win/x64/config/aom_dsp_rtcd.h | 179
-rw-r--r--  media/libaom/config/win/x64/config/aom_scale_rtcd.h | 12
-rw-r--r--  media/libaom/config/win/x64/config/av1_rtcd.h | 66
-rw-r--r--  media/libaom/moz.yaml | 8
-rw-r--r--  media/libaom/sources.mozbuild | 15
-rw-r--r--  media/libcubeb/0004-audiounit-ios-compile-fixes.patch | 1415
-rw-r--r--  media/libcubeb/0005-aaudio-timing-fix.patch | 57
-rw-r--r--  media/libcubeb/moz.yaml | 2
-rw-r--r--  media/libcubeb/src/cubeb_aaudio.cpp | 2
-rw-r--r--  media/libcubeb/src/cubeb_audiounit.cpp | 190
-rw-r--r--  media/libcubeb/src/cubeb_triple_buffer.h | 7
-rw-r--r--  media/libcubeb/src/moz.build | 2
-rw-r--r--  media/libcubeb/test/test_triple_buffer.cpp | 3
-rw-r--r--  media/libdav1d/config.h | 3
-rw-r--r--  media/libdav1d/moz.yaml | 4
-rw-r--r--  media/libdav1d/vcs_version.h | 2
-rw-r--r--  media/libjxl/moz.yaml | 4
-rw-r--r--  media/libopus/celt/arm/armcpu.c | 52
-rw-r--r--  media/libopus/celt/x86/x86cpu.h | 18
-rw-r--r--  media/libopus/moz.build | 2
-rw-r--r--  media/libopus/moz.yaml | 4
-rw-r--r--  media/libopus/silk/x86/NSQ_del_dec_avx2.c | 15
-rw-r--r--  media/libopus/src/opus_private.h | 2
-rw-r--r--  media/libopus/src/repacketizer.c | 3
-rw-r--r--  media/libvpx/arm_cpu_runtime_detection_code_on_openbsd.patch | 41
-rw-r--r--  media/libvpx/config/generic/vpx_config.asm | 1
-rw-r--r--  media/libvpx/config/generic/vpx_config.c | 2
-rw-r--r--  media/libvpx/config/generic/vpx_config.h | 1
-rw-r--r--  media/libvpx/config/linux/arm/vpx_config.asm | 1
-rw-r--r--  media/libvpx/config/linux/arm/vpx_config.c | 2
-rw-r--r--  media/libvpx/config/linux/arm/vpx_config.h | 1
-rw-r--r--  media/libvpx/config/linux/arm64/vp9_rtcd.h | 10
-rw-r--r--  media/libvpx/config/linux/arm64/vpx_config.asm | 1
-rw-r--r--  media/libvpx/config/linux/arm64/vpx_config.c | 2
-rw-r--r--  media/libvpx/config/linux/arm64/vpx_config.h | 1
-rw-r--r--  media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h | 5
-rw-r--r--  media/libvpx/config/linux/ia32/vpx_config.asm | 1
-rw-r--r--  media/libvpx/config/linux/ia32/vpx_config.c | 2
-rw-r--r--  media/libvpx/config/linux/ia32/vpx_config.h | 1
-rw-r--r--  media/libvpx/config/linux/x64/vpx_config.asm | 1
-rw-r--r--  media/libvpx/config/linux/x64/vpx_config.c | 2
-rw-r--r--  media/libvpx/config/linux/x64/vpx_config.h | 1
-rw-r--r--  media/libvpx/config/mac/ia32/vpx_config.asm | 1
-rw-r--r--  media/libvpx/config/mac/ia32/vpx_config.c | 2
-rw-r--r--  media/libvpx/config/mac/ia32/vpx_config.h | 1
-rw-r--r--  media/libvpx/config/mac/x64/vpx_config.asm | 1
-rw-r--r--  media/libvpx/config/mac/x64/vpx_config.c | 2
-rw-r--r--  media/libvpx/config/mac/x64/vpx_config.h | 1
-rw-r--r--  media/libvpx/config/win/aarch64/vpx_config.asm | 3
-rw-r--r--  media/libvpx/config/win/aarch64/vpx_config.c | 2
-rw-r--r--  media/libvpx/config/win/aarch64/vpx_config.h | 3
-rwxr-xr-x  media/libvpx/config/win/ia32/vpx_config.asm | 1
-rw-r--r--  media/libvpx/config/win/ia32/vpx_config.c | 2
-rw-r--r--  media/libvpx/config/win/ia32/vpx_config.h | 1
-rw-r--r--  media/libvpx/config/win/x64/vpx_config.asm | 1
-rw-r--r--  media/libvpx/config/win/x64/vpx_config.c | 2
-rw-r--r--  media/libvpx/config/win/x64/vpx_config.h | 1
-rwxr-xr-x  media/libvpx/generate_sources_mozbuild.sh | 9
-rw-r--r--  media/libvpx/input_frame_validation.patch | 14
-rw-r--r--  media/libvpx/libvpx/.mailmap | 3
-rw-r--r--  media/libvpx/libvpx/AUTHORS | 11
-rw-r--r--  media/libvpx/libvpx/CHANGELOG | 76
-rw-r--r--  media/libvpx/libvpx/README | 40
-rw-r--r--  media/libvpx/libvpx/build/make/Android.mk | 13
-rw-r--r--  media/libvpx/libvpx/build/make/Makefile | 2
-rw-r--r--  media/libvpx/libvpx/build/make/configure.sh | 109
-rwxr-xr-x  media/libvpx/libvpx/build/make/rtcd.pl | 2
-rwxr-xr-x  media/libvpx/libvpx/configure | 7
-rw-r--r--  media/libvpx/libvpx/examples/resize_util.c | 2
-rw-r--r--  media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c | 9
-rw-r--r--  media/libvpx/libvpx/examples/vp9cx_set_ref.c | 2
-rw-r--r--  media/libvpx/libvpx/libs.doxy_template | 8
-rw-r--r--  media/libvpx/libvpx/libs.mk | 4
-rw-r--r--  media/libvpx/libvpx/test/android/get_files.py | 17
-rw-r--r--  media/libvpx/libvpx/test/avg_test.cc | 9
-rw-r--r--  media/libvpx/libvpx/test/codec_factory.h | 8
-rw-r--r--  media/libvpx/libvpx/test/convolve_test.cc | 86
-rw-r--r--  media/libvpx/libvpx/test/encode_api_test.cc | 418
-rw-r--r--  media/libvpx/libvpx/test/frame_size_tests.cc | 2
-rw-r--r--  media/libvpx/libvpx/test/init_vpx_test.cc | 3
-rw-r--r--  media/libvpx/libvpx/test/resize_test.cc | 10
-rw-r--r--  media/libvpx/libvpx/test/sum_squares_test.cc | 7
-rw-r--r--  media/libvpx/libvpx/test/variance_test.cc | 261
-rw-r--r--  media/libvpx/libvpx/test/video_source.h | 2
-rw-r--r--  media/libvpx/libvpx/test/vp8_datarate_test.cc | 25
-rw-r--r--  media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc | 7
-rw-r--r--  media/libvpx/libvpx/test/vp9_block_error_test.cc | 9
-rw-r--r--  media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc | 987
-rw-r--r--  media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc | 3
-rw-r--r--  media/libvpx/libvpx/test/vp9_scale_test.cc | 9
-rw-r--r--  media/libvpx/libvpx/tools_common.c | 36
-rw-r--r--  media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c | 2
-rw-r--r--  media/libvpx/libvpx/vp8/common/entropy.c | 2
-rw-r--r--  media/libvpx/libvpx/vp8/common/generic/systemdependent.c | 41
-rw-r--r--  media/libvpx/libvpx/vp8/common/onyx.h | 2
-rw-r--r--  media/libvpx/libvpx/vp8/common/rtcd.c | 2
-rw-r--r--  media/libvpx/libvpx/vp8/common/threading.h | 153
-rw-r--r--  media/libvpx/libvpx/vp8/decoder/onyxd_if.c | 2
-rw-r--r--  media/libvpx/libvpx/vp8/decoder/onyxd_int.h | 5
-rw-r--r--  media/libvpx/libvpx/vp8/decoder/threading.c | 33
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/encodeframe.c | 46
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/ethreading.c | 63
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/onyx_if.c | 48
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/onyx_int.h | 9
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/ratectrl.c | 29
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/tokenize.h | 2
-rw-r--r--  media/libvpx/libvpx/vp8/vp8_cx_iface.c | 84
-rw-r--r--  media/libvpx/libvpx/vp8/vp8_dx_iface.c | 2
-rw-r--r--  media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc | 13
-rw-r--r--  media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h | 10
-rw-r--r--  media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h | 1
-rw-r--r--  media/libvpx/libvpx/vp9/common/vp9_rtcd.c | 2
-rw-r--r--  media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl | 6
-rw-r--r--  media/libvpx/libvpx/vp9/common/vp9_thread_common.c | 1
-rw-r--r--  media/libvpx/libvpx/vp9/common/vp9_thread_common.h | 1
-rw-r--r--  media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c | 2
-rw-r--r--  media/libvpx/libvpx/vp9/decoder/vp9_decoder.c | 2
-rw-r--r--  media/libvpx/libvpx/vp9/decoder/vp9_decoder.h | 1
-rw-r--r--  media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c | 1
-rw-r--r--  media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h | 2
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c | 78
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_block.h | 2
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c | 6
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h | 2
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c | 107
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encoder.c | 174
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encoder.h | 8
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ethread.c | 13
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ethread.h | 3
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c | 52
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h | 9
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_extend.c | 39
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_extend.h | 3
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c | 114
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c | 97
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c | 1
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_quantize.c | 1
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c | 81
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h | 6
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c | 2
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c | 74
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h | 1
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c | 16
-rw-r--r--  media/libvpx/libvpx/vp9/ratectrl_rtc.cc | 2
-rw-r--r--  media/libvpx/libvpx/vp9/ratectrl_rtc.h | 35
-rw-r--r--  media/libvpx/libvpx/vp9/simple_encode.cc | 12
-rw-r--r--  media/libvpx/libvpx/vp9/vp9_cx_iface.c | 102
-rw-r--r--  media/libvpx/libvpx/vp9/vp9_dx_iface.c | 1
-rw-r--r--  media/libvpx/libvpx/vp9/vp9cx.mk | 1
-rw-r--r--  media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h | 8
-rw-r--r--  media/libvpx/libvpx/vpx/src/vpx_encoder.c | 9
-rw-r--r--  media/libvpx/libvpx/vpx/src/vpx_image.c | 4
-rw-r--r--  media/libvpx/libvpx/vpx/src/vpx_tpl.c | 6
-rw-r--r--  media/libvpx/libvpx/vpx/vp8cx.h | 2
-rw-r--r--  media/libvpx/libvpx/vpx/vpx_encoder.h | 18
-rw-r--r--  media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h | 24
-rw-r--r--  media/libvpx/libvpx/vpx/vpx_tpl.h | 22
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c | 68
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c | 344
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c | 1905
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c | 351
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c | 452
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c | 58
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c | 2
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h | 201
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c | 73
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h | 72
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c | 897
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h | 449
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c | 1428
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c | 1250
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c | 58
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c | 66
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c | 66
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h | 32
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h | 51
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c | 445
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk | 8
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c | 2
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl | 124
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/vpx_filter.h | 1
-rw-r--r--  media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c | 21
-rw-r--r--  media/libvpx/libvpx/vpx_ports/arm.h | 2
-rw-r--r--  media/libvpx/libvpx/vpx_ports/emms_mmx.c | 2
-rw-r--r--  media/libvpx/libvpx/vpx_ports/mem.h | 8
-rw-r--r--  media/libvpx/libvpx/vpx_ports/vpx_once.h | 23
-rw-r--r--  media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c | 2
-rw-r--r--  media/libvpx/libvpx/vpx_util/vpx_pthread.h | 157
-rw-r--r--  media/libvpx/libvpx/vpx_util/vpx_thread.c | 93
-rw-r--r--  media/libvpx/libvpx/vpx_util/vpx_thread.h | 366
-rw-r--r--  media/libvpx/libvpx/vpx_util/vpx_util.mk | 1
-rw-r--r--  media/libvpx/missing_header.patch | 12
-rw-r--r--  media/libvpx/moz.build | 9
-rw-r--r--  media/libvpx/moz.yaml | 6
-rw-r--r--  media/libvpx/sources.mozbuild | 4
238 files changed, 10624 insertions(+), 6869 deletions(-)
diff --git a/media/ffvpx/README_MOZILLA b/media/ffvpx/README_MOZILLA
index 1c00f2761a..b766be4abd 100644
--- a/media/ffvpx/README_MOZILLA
+++ b/media/ffvpx/README_MOZILLA
@@ -158,6 +158,8 @@ There are going to be a lot of changes in terms of symbols exported. Adjust
`libavutil/avutil.symbols` and `libavcodec/avcodec.symbols` by removing and
adding symbols until the build passes.
-Finally, apply the patch:
+Finally, apply the patches:
- no-unicode-stdio.patch to avoid passing the infinity symbol in Unicode to a
stdio.h function, which causes the issue in bug 1879740 on Windows.
+- opusenc-dtx.patch to allow enabling DTX in the opus encoder.
+
diff --git a/media/ffvpx/config_components_audio_only.h b/media/ffvpx/config_components_audio_only.h
index 0e61e23898..4ba265a9f7 100644
--- a/media/ffvpx/config_components_audio_only.h
+++ b/media/ffvpx/config_components_audio_only.h
@@ -787,7 +787,7 @@
#define CONFIG_LIBMP3LAME_ENCODER 0
#define CONFIG_LIBOPENCORE_AMRNB_ENCODER 0
#define CONFIG_LIBOPENJPEG_ENCODER 0
-#define CONFIG_LIBOPUS_ENCODER 0
+#define CONFIG_LIBOPUS_ENCODER 1
#define CONFIG_LIBRAV1E_ENCODER 0
#define CONFIG_LIBSHINE_ENCODER 0
#define CONFIG_LIBSPEEX_ENCODER 0
@@ -795,7 +795,7 @@
#define CONFIG_LIBTHEORA_ENCODER 0
#define CONFIG_LIBTWOLAME_ENCODER 0
#define CONFIG_LIBVO_AMRWBENC_ENCODER 0
-#define CONFIG_LIBVORBIS_ENCODER 0
+#define CONFIG_LIBVORBIS_ENCODER 1
#define CONFIG_LIBVPX_VP8_ENCODER 0
#define CONFIG_LIBVPX_VP9_ENCODER 0
#define CONFIG_LIBWEBP_ANIM_ENCODER 0
diff --git a/media/ffvpx/config_components_audio_video.h b/media/ffvpx/config_components_audio_video.h
index c8423a895e..220eb6ca52 100644
--- a/media/ffvpx/config_components_audio_video.h
+++ b/media/ffvpx/config_components_audio_video.h
@@ -810,7 +810,7 @@
#define CONFIG_LIBMP3LAME_ENCODER 0
#define CONFIG_LIBOPENCORE_AMRNB_ENCODER 0
#define CONFIG_LIBOPENJPEG_ENCODER 0
-#define CONFIG_LIBOPUS_ENCODER 0
+#define CONFIG_LIBOPUS_ENCODER 1
#define CONFIG_LIBRAV1E_ENCODER 0
#define CONFIG_LIBSHINE_ENCODER 0
#define CONFIG_LIBSPEEX_ENCODER 0
@@ -818,7 +818,7 @@
#define CONFIG_LIBTHEORA_ENCODER 0
#define CONFIG_LIBTWOLAME_ENCODER 0
#define CONFIG_LIBVO_AMRWBENC_ENCODER 0
-#define CONFIG_LIBVORBIS_ENCODER 0
+#define CONFIG_LIBVORBIS_ENCODER 1
#define CONFIG_LIBVPX_VP8_ENCODER 1
#define CONFIG_LIBVPX_VP9_ENCODER 1
#define CONFIG_LIBWEBP_ANIM_ENCODER 0
diff --git a/media/ffvpx/libavcodec/audio_frame_queue.c b/media/ffvpx/libavcodec/audio_frame_queue.c
new file mode 100644
index 0000000000..08b4b368c7
--- /dev/null
+++ b/media/ffvpx/libavcodec/audio_frame_queue.c
@@ -0,0 +1,113 @@
+/*
+ * Audio Frame Queue
+ * Copyright (c) 2012 Justin Ruggles
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+#include "audio_frame_queue.h"
+#include "encode.h"
+#include "libavutil/avassert.h"
+
+av_cold void ff_af_queue_init(AVCodecContext *avctx, AudioFrameQueue *afq)
+{
+ afq->avctx = avctx;
+ afq->remaining_delay = avctx->initial_padding;
+ afq->remaining_samples = avctx->initial_padding;
+ afq->frame_count = 0;
+}
+
+void ff_af_queue_close(AudioFrameQueue *afq)
+{
+ if(afq->frame_count)
+ av_log(afq->avctx, AV_LOG_WARNING, "%d frames left in the queue on closing\n", afq->frame_count);
+ av_freep(&afq->frames);
+ memset(afq, 0, sizeof(*afq));
+}
+
+int ff_af_queue_add(AudioFrameQueue *afq, const AVFrame *f)
+{
+ AudioFrame *new = av_fast_realloc(afq->frames, &afq->frame_alloc, sizeof(*afq->frames)*(afq->frame_count+1));
+ if(!new)
+ return AVERROR(ENOMEM);
+ afq->frames = new;
+ new += afq->frame_count;
+
+ /* get frame parameters */
+ new->duration = f->nb_samples;
+ new->duration += afq->remaining_delay;
+ if (f->pts != AV_NOPTS_VALUE) {
+ new->pts = av_rescale_q(f->pts,
+ afq->avctx->time_base,
+ (AVRational){ 1, afq->avctx->sample_rate });
+ new->pts -= afq->remaining_delay;
+ if(afq->frame_count && new[-1].pts >= new->pts)
+ av_log(afq->avctx, AV_LOG_WARNING, "Queue input is backward in time\n");
+ } else {
+ new->pts = AV_NOPTS_VALUE;
+ }
+ afq->remaining_delay = 0;
+
+ /* add frame sample count */
+ afq->remaining_samples += f->nb_samples;
+
+ afq->frame_count++;
+
+ return 0;
+}
+
+void ff_af_queue_remove(AudioFrameQueue *afq, int nb_samples, int64_t *pts,
+ int64_t *duration)
+{
+ int64_t out_pts = AV_NOPTS_VALUE;
+ int removed_samples = 0;
+ int i;
+
+ if (afq->frame_count || afq->frame_alloc) {
+ if (afq->frames->pts != AV_NOPTS_VALUE)
+ out_pts = afq->frames->pts;
+ }
+ if(!afq->frame_count)
+ av_log(afq->avctx, AV_LOG_WARNING, "Trying to remove %d samples, but the queue is empty\n", nb_samples);
+ if (pts)
+ *pts = ff_samples_to_time_base(afq->avctx, out_pts);
+
+ for(i=0; nb_samples && i<afq->frame_count; i++){
+ int n= FFMIN(afq->frames[i].duration, nb_samples);
+ afq->frames[i].duration -= n;
+ nb_samples -= n;
+ removed_samples += n;
+ if(afq->frames[i].pts != AV_NOPTS_VALUE)
+ afq->frames[i].pts += n;
+ }
+ afq->remaining_samples -= removed_samples;
+ i -= i && afq->frames[i-1].duration;
+ memmove(afq->frames, afq->frames + i, sizeof(*afq->frames) * (afq->frame_count - i));
+ afq->frame_count -= i;
+
+ if(nb_samples){
+ av_assert0(!afq->frame_count);
+ av_assert0(afq->remaining_samples == afq->remaining_delay);
+ if(afq->frames && afq->frames[0].pts != AV_NOPTS_VALUE)
+ afq->frames[0].pts += nb_samples;
+ av_log(afq->avctx, AV_LOG_DEBUG, "Trying to remove %d more samples than there are in the queue\n", nb_samples);
+ }
+ if (duration)
+ *duration = ff_samples_to_time_base(afq->avctx, removed_samples);
+}
diff --git a/media/ffvpx/libavcodec/audio_frame_queue.h b/media/ffvpx/libavcodec/audio_frame_queue.h
new file mode 100644
index 0000000000..d8076eae54
--- /dev/null
+++ b/media/ffvpx/libavcodec/audio_frame_queue.h
@@ -0,0 +1,83 @@
+/*
+ * Audio Frame Queue
+ * Copyright (c) 2012 Justin Ruggles
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AUDIO_FRAME_QUEUE_H
+#define AVCODEC_AUDIO_FRAME_QUEUE_H
+
+#include "avcodec.h"
+
+typedef struct AudioFrame {
+ int64_t pts;
+ int duration;
+} AudioFrame;
+
+typedef struct AudioFrameQueue {
+ AVCodecContext *avctx;
+ int remaining_delay;
+ int remaining_samples;
+ AudioFrame *frames;
+ unsigned frame_count;
+ unsigned frame_alloc;
+} AudioFrameQueue;
+
+/**
+ * Initialize AudioFrameQueue.
+ *
+ * @param avctx context to use for time_base and av_log
+ * @param afq queue context
+ */
+void ff_af_queue_init(AVCodecContext *avctx, AudioFrameQueue *afq);
+
+/**
+ * Close AudioFrameQueue.
+ *
+ * Frees memory if needed.
+ *
+ * @param afq queue context
+ */
+void ff_af_queue_close(AudioFrameQueue *afq);
+
+/**
+ * Add a frame to the queue.
+ *
+ * @param afq queue context
+ * @param f frame to add to the queue
+ */
+int ff_af_queue_add(AudioFrameQueue *afq, const AVFrame *f);
+
+/**
+ * Remove frame(s) from the queue.
+ *
+ * Retrieves the pts of the next available frame, or a generated pts based on
+ * the last frame duration if there are no frames left in the queue. The number
+ * of requested samples should be the full number of samples represented by the
+ * packet that will be output by the encoder. If fewer samples are available
+ * in the queue, a smaller value will be used for the output duration.
+ *
+ * @param afq queue context
+ * @param nb_samples number of samples to remove from the queue
+ * @param[out] pts output packet pts
+ * @param[out] duration output packet duration
+ */
+void ff_af_queue_remove(AudioFrameQueue *afq, int nb_samples, int64_t *pts,
+ int64_t *duration);
+
+#endif /* AVCODEC_AUDIO_FRAME_QUEUE_H */
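For orientation: ff_af_queue_add records each input frame's pts and sample count, and ff_af_queue_remove later hands the matching timestamps to the output packet. Below is a minimal sketch of that call pattern, assuming the internal FFmpeg API declared above; SketchEncContext is a hypothetical encoder context, and this is not a complete encoder:

/* Hypothetical encoder context; only the queue member matters here. */
typedef struct SketchEncContext {
    AudioFrameQueue afq;
} SketchEncContext;

static int sketch_encode(AVCodecContext *avctx, AVPacket *avpkt,
                         const AVFrame *frame, int *got_packet_ptr)
{
    SketchEncContext *s = avctx->priv_data;
    int ret;

    if (frame) {
        /* Remember the frame's pts/duration before encoding it. */
        if ((ret = ff_af_queue_add(&s->afq, frame)) < 0)
            return ret;
    }

    /* ... run the codec, producing a packet of avctx->frame_size samples ... */

    /* Pull the matching timestamps back out for the finished packet. */
    ff_af_queue_remove(&s->afq, avctx->frame_size,
                       &avpkt->pts, &avpkt->duration);
    *got_packet_ptr = 1;
    return 0;
}

This is the shape both libopusenc.c and libvorbisenc.c below follow.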
diff --git a/media/ffvpx/libavcodec/codec_list.c b/media/ffvpx/libavcodec/codec_list.c
index 04259e3cd7..7c6b0ceacd 100644
--- a/media/ffvpx/libavcodec/codec_list.c
+++ b/media/ffvpx/libavcodec/codec_list.c
@@ -20,6 +20,9 @@ static const FFCodec * const codec_list[] = {
#if CONFIG_LIBVORBIS_DECODER
&ff_libvorbis_decoder,
#endif
+#if CONFIG_LIBVORBIS_ENCODER
+ &ff_libvorbis_encoder,
+#endif
#if CONFIG_PCM_ALAW_DECODER
&ff_pcm_alaw_decoder,
#endif
@@ -44,6 +47,9 @@ static const FFCodec * const codec_list[] = {
#if CONFIG_LIBOPUS_DECODER
&ff_libopus_decoder,
#endif
+#if CONFIG_LIBOPUS_ENCODER
+ &ff_libopus_encoder,
+#endif
#if CONFIG_LIBVPX_VP8_DECODER
&ff_libvpx_vp8_decoder,
#endif
diff --git a/media/ffvpx/libavcodec/libopusenc.c b/media/ffvpx/libavcodec/libopusenc.c
new file mode 100644
index 0000000000..68667e3350
--- /dev/null
+++ b/media/ffvpx/libavcodec/libopusenc.c
@@ -0,0 +1,610 @@
+/*
+ * Opus encoder using libopus
+ * Copyright (c) 2012 Nathan Caldwell
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <opus.h>
+#include <opus_multistream.h>
+
+#include "libavutil/channel_layout.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "bytestream.h"
+#include "codec_internal.h"
+#include "encode.h"
+#include "libopus.h"
+#include "audio_frame_queue.h"
+#include "vorbis_data.h"
+
+typedef struct LibopusEncOpts {
+ int vbr;
+ int application;
+ int packet_loss;
+ int fec;
+ int complexity;
+ float frame_duration;
+ int packet_size;
+ int max_bandwidth;
+ int mapping_family;
+ int dtx;
+#ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST
+ int apply_phase_inv;
+#endif
+} LibopusEncOpts;
+
+typedef struct LibopusEncContext {
+ AVClass *class;
+ OpusMSEncoder *enc;
+ int stream_count;
+ uint8_t *samples;
+ LibopusEncOpts opts;
+ AudioFrameQueue afq;
+ const uint8_t *encoder_channel_map;
+} LibopusEncContext;
+
+static const uint8_t opus_coupled_streams[8] = {
+ 0, 1, 1, 2, 2, 2, 2, 3
+};
+
+/* Opus internal to Vorbis channel order mapping written in the header */
+static const uint8_t opus_vorbis_channel_map[8][8] = {
+ { 0 },
+ { 0, 1 },
+ { 0, 2, 1 },
+ { 0, 1, 2, 3 },
+ { 0, 4, 1, 2, 3 },
+ { 0, 4, 1, 2, 3, 5 },
+ { 0, 4, 1, 2, 3, 5, 6 },
+ { 0, 6, 1, 2, 3, 4, 5, 7 },
+};
+
+/* libavcodec to libopus channel order mapping, passed to libopus */
+static const uint8_t libavcodec_libopus_channel_map[8][8] = {
+ { 0 },
+ { 0, 1 },
+ { 0, 1, 2 },
+ { 0, 1, 2, 3 },
+ { 0, 1, 3, 4, 2 },
+ { 0, 1, 4, 5, 2, 3 },
+ { 0, 1, 5, 6, 2, 4, 3 },
+ { 0, 1, 6, 7, 4, 5, 2, 3 },
+};
+
+static void libopus_write_header(AVCodecContext *avctx, int stream_count,
+ int coupled_stream_count,
+ int mapping_family,
+ const uint8_t *channel_mapping)
+{
+ uint8_t *p = avctx->extradata;
+ int channels = avctx->ch_layout.nb_channels;
+
+ bytestream_put_buffer(&p, "OpusHead", 8);
+ bytestream_put_byte(&p, 1); /* Version */
+ bytestream_put_byte(&p, channels);
+ bytestream_put_le16(&p, avctx->initial_padding * 48000 / avctx->sample_rate); /* Lookahead samples at 48kHz */
+ bytestream_put_le32(&p, avctx->sample_rate); /* Original sample rate */
+ bytestream_put_le16(&p, 0); /* Gain of 0dB is recommended. */
+
+ /* Channel mapping */
+ bytestream_put_byte(&p, mapping_family);
+ if (mapping_family != 0) {
+ bytestream_put_byte(&p, stream_count);
+ bytestream_put_byte(&p, coupled_stream_count);
+ bytestream_put_buffer(&p, channel_mapping, channels);
+ }
+}
+
+static int libopus_configure_encoder(AVCodecContext *avctx, OpusMSEncoder *enc,
+ LibopusEncOpts *opts)
+{
+ int ret;
+
+ if (avctx->global_quality) {
+ av_log(avctx, AV_LOG_ERROR,
+ "Quality-based encoding not supported, "
+ "please specify a bitrate and VBR setting.\n");
+ return AVERROR(EINVAL);
+ }
+
+ ret = opus_multistream_encoder_ctl(enc, OPUS_SET_BITRATE(avctx->bit_rate));
+ if (ret != OPUS_OK) {
+ av_log(avctx, AV_LOG_ERROR,
+ "Failed to set bitrate: %s\n", opus_strerror(ret));
+ return ret;
+ }
+
+ ret = opus_multistream_encoder_ctl(enc,
+ OPUS_SET_COMPLEXITY(opts->complexity));
+ if (ret != OPUS_OK)
+ av_log(avctx, AV_LOG_WARNING,
+ "Unable to set complexity: %s\n", opus_strerror(ret));
+
+ ret = opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(!!opts->vbr));
+ if (ret != OPUS_OK)
+ av_log(avctx, AV_LOG_WARNING,
+ "Unable to set VBR: %s\n", opus_strerror(ret));
+
+ ret = opus_multistream_encoder_ctl(enc,
+ OPUS_SET_VBR_CONSTRAINT(opts->vbr == 2));
+ if (ret != OPUS_OK)
+ av_log(avctx, AV_LOG_WARNING,
+ "Unable to set constrained VBR: %s\n", opus_strerror(ret));
+
+ ret = opus_multistream_encoder_ctl(enc,
+ OPUS_SET_PACKET_LOSS_PERC(opts->packet_loss));
+ if (ret != OPUS_OK)
+ av_log(avctx, AV_LOG_WARNING,
+ "Unable to set expected packet loss percentage: %s\n",
+ opus_strerror(ret));
+
+ ret = opus_multistream_encoder_ctl(enc,
+ OPUS_SET_INBAND_FEC(opts->fec));
+ if (ret != OPUS_OK)
+ av_log(avctx, AV_LOG_WARNING,
+ "Unable to set inband FEC: %s\n",
+ opus_strerror(ret));
+
+ ret = opus_multistream_encoder_ctl(enc,
+ OPUS_SET_DTX(opts->dtx));
+ if (ret != OPUS_OK)
+ av_log(avctx, AV_LOG_WARNING,
+ "Unable to set DTX: %s\n",
+ opus_strerror(ret));
+
+ if (avctx->cutoff) {
+ ret = opus_multistream_encoder_ctl(enc,
+ OPUS_SET_MAX_BANDWIDTH(opts->max_bandwidth));
+ if (ret != OPUS_OK)
+ av_log(avctx, AV_LOG_WARNING,
+ "Unable to set maximum bandwidth: %s\n", opus_strerror(ret));
+ }
+
+#ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST
+ ret = opus_multistream_encoder_ctl(enc,
+ OPUS_SET_PHASE_INVERSION_DISABLED(!opts->apply_phase_inv));
+ if (ret != OPUS_OK)
+ av_log(avctx, AV_LOG_WARNING,
+ "Unable to set phase inversion: %s\n",
+ opus_strerror(ret));
+#endif
+ return OPUS_OK;
+}
+
+static int libopus_check_max_channels(AVCodecContext *avctx,
+ int max_channels) {
+ if (avctx->ch_layout.nb_channels > max_channels) {
+ av_log(avctx, AV_LOG_ERROR, "Opus mapping family undefined for %d channels.\n",
+ avctx->ch_layout.nb_channels);
+ return AVERROR(EINVAL);
+ }
+
+ return 0;
+}
+
+static int libopus_check_vorbis_layout(AVCodecContext *avctx, int mapping_family) {
+ av_assert2(avctx->ch_layout.nb_channels < FF_ARRAY_ELEMS(ff_vorbis_ch_layouts));
+
+ if (avctx->ch_layout.order == AV_CHANNEL_ORDER_UNSPEC) {
+ av_log(avctx, AV_LOG_WARNING,
+ "No channel layout specified. Opus encoder will use Vorbis "
+ "channel layout for %d channels.\n", avctx->ch_layout.nb_channels);
+ } else if (av_channel_layout_compare(&avctx->ch_layout, &ff_vorbis_ch_layouts[avctx->ch_layout.nb_channels - 1])) {
+ char name[32];
+
+ av_channel_layout_describe(&avctx->ch_layout, name, sizeof(name));
+ av_log(avctx, AV_LOG_ERROR,
+ "Invalid channel layout %s for specified mapping family %d.\n",
+ name, mapping_family);
+
+ return AVERROR(EINVAL);
+ }
+
+ return 0;
+}
+
+static int libopus_validate_layout_and_get_channel_map(
+ AVCodecContext *avctx,
+ int mapping_family,
+ const uint8_t ** channel_map_result)
+{
+ const uint8_t * channel_map = NULL;
+ int ret;
+
+ switch (mapping_family) {
+ case -1:
+ ret = libopus_check_max_channels(avctx, 8);
+ if (ret == 0) {
+ ret = libopus_check_vorbis_layout(avctx, mapping_family);
+ /* Channels do not need to be reordered. */
+ }
+
+ break;
+ case 0:
+ ret = libopus_check_max_channels(avctx, 2);
+ if (ret == 0) {
+ ret = libopus_check_vorbis_layout(avctx, mapping_family);
+ }
+ break;
+ case 1:
+ /* Opus expects channels to be in Vorbis order. */
+ ret = libopus_check_max_channels(avctx, 8);
+ if (ret == 0) {
+ ret = libopus_check_vorbis_layout(avctx, mapping_family);
+ channel_map = ff_vorbis_channel_layout_offsets[avctx->ch_layout.nb_channels - 1];
+ }
+ break;
+ case 255:
+ ret = libopus_check_max_channels(avctx, 254);
+ break;
+ default:
+ av_log(avctx, AV_LOG_WARNING,
+ "Unknown channel mapping family %d. Output channel layout may be invalid.\n",
+ mapping_family);
+ ret = 0;
+ }
+
+ *channel_map_result = channel_map;
+ return ret;
+}
+
+static av_cold int libopus_encode_init(AVCodecContext *avctx)
+{
+ LibopusEncContext *opus = avctx->priv_data;
+ OpusMSEncoder *enc;
+ uint8_t libopus_channel_mapping[255];
+ int ret = OPUS_OK;
+ int channels = avctx->ch_layout.nb_channels;
+ int av_ret;
+ int coupled_stream_count, header_size, frame_size;
+ int mapping_family;
+
+ frame_size = opus->opts.frame_duration * 48000 / 1000;
+ switch (frame_size) {
+ case 120:
+ case 240:
+ if (opus->opts.application != OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+ av_log(avctx, AV_LOG_WARNING,
+ "LPC mode cannot be used with a frame duration of less "
+ "than 10ms. Enabling restricted low-delay mode.\n"
+ "Use a longer frame duration if this is not what you want.\n");
+ /* Frame sizes less than 10 ms can only use MDCT mode, so switching to
+ * RESTRICTED_LOWDELAY avoids an unnecessary extra 2.5ms lookahead. */
+ opus->opts.application = OPUS_APPLICATION_RESTRICTED_LOWDELAY;
+ case 480:
+ case 960:
+ case 1920:
+ case 2880:
+#ifdef OPUS_FRAMESIZE_120_MS
+ case 3840:
+ case 4800:
+ case 5760:
+#endif
+ opus->opts.packet_size =
+ avctx->frame_size = frame_size * avctx->sample_rate / 48000;
+ break;
+ default:
+ av_log(avctx, AV_LOG_ERROR, "Invalid frame duration: %g.\n"
+ "Frame duration must be exactly one of: 2.5, 5, 10, 20, 40"
+#ifdef OPUS_FRAMESIZE_120_MS
+ ", 60, 80, 100 or 120.\n",
+#else
+ " or 60.\n",
+#endif
+ opus->opts.frame_duration);
+ return AVERROR(EINVAL);
+ }
+
+ if (avctx->compression_level < 0 || avctx->compression_level > 10) {
+ av_log(avctx, AV_LOG_WARNING,
+ "Compression level must be in the range 0 to 10. "
+ "Defaulting to 10.\n");
+ opus->opts.complexity = 10;
+ } else {
+ opus->opts.complexity = avctx->compression_level;
+ }
+
+ if (avctx->cutoff) {
+ switch (avctx->cutoff) {
+ case 4000:
+ opus->opts.max_bandwidth = OPUS_BANDWIDTH_NARROWBAND;
+ break;
+ case 6000:
+ opus->opts.max_bandwidth = OPUS_BANDWIDTH_MEDIUMBAND;
+ break;
+ case 8000:
+ opus->opts.max_bandwidth = OPUS_BANDWIDTH_WIDEBAND;
+ break;
+ case 12000:
+ opus->opts.max_bandwidth = OPUS_BANDWIDTH_SUPERWIDEBAND;
+ break;
+ case 20000:
+ opus->opts.max_bandwidth = OPUS_BANDWIDTH_FULLBAND;
+ break;
+ default:
+ av_log(avctx, AV_LOG_WARNING,
+ "Invalid frequency cutoff: %d. Using default maximum bandwidth.\n"
+ "Cutoff frequency must be exactly one of: 4000, 6000, 8000, 12000 or 20000.\n",
+ avctx->cutoff);
+ avctx->cutoff = 0;
+ }
+ }
+
+ /* Channels may need to be reordered to match opus mapping. */
+ av_ret = libopus_validate_layout_and_get_channel_map(avctx, opus->opts.mapping_family,
+ &opus->encoder_channel_map);
+ if (av_ret) {
+ return av_ret;
+ }
+
+ if (opus->opts.mapping_family == -1) {
+ /* By default, use mapping family 1 for the header but use the older
+ * libopus multistream API to avoid surround masking. */
+
+ /* Set the mapping family so that the value is correct in the header */
+ mapping_family = channels > 2 ? 1 : 0;
+ coupled_stream_count = opus_coupled_streams[channels - 1];
+ opus->stream_count = channels - coupled_stream_count;
+ memcpy(libopus_channel_mapping,
+ opus_vorbis_channel_map[channels - 1],
+ channels * sizeof(*libopus_channel_mapping));
+
+ enc = opus_multistream_encoder_create(
+ avctx->sample_rate, channels, opus->stream_count,
+ coupled_stream_count,
+ libavcodec_libopus_channel_map[channels - 1],
+ opus->opts.application, &ret);
+ } else {
+ /* Use the newer multistream API. The encoder will set the channel
+ * mapping and coupled stream counts to its internal defaults and will
+ * use surround masking analysis to save bits. */
+ mapping_family = opus->opts.mapping_family;
+ enc = opus_multistream_surround_encoder_create(
+ avctx->sample_rate, channels, mapping_family,
+ &opus->stream_count, &coupled_stream_count, libopus_channel_mapping,
+ opus->opts.application, &ret);
+ }
+
+ if (ret != OPUS_OK) {
+ av_log(avctx, AV_LOG_ERROR,
+ "Failed to create encoder: %s\n", opus_strerror(ret));
+ return ff_opus_error_to_averror(ret);
+ }
+
+ if (!avctx->bit_rate) {
+ /* Sane default copied from opusenc */
+ avctx->bit_rate = 64000 * opus->stream_count +
+ 32000 * coupled_stream_count;
+ av_log(avctx, AV_LOG_WARNING,
+ "No bit rate set. Defaulting to %"PRId64" bps.\n", avctx->bit_rate);
+ }
+
+ if (avctx->bit_rate < 500 || avctx->bit_rate > 256000 * channels) {
+ av_log(avctx, AV_LOG_ERROR, "The bit rate %"PRId64" bps is unsupported. "
+ "Please choose a value between 500 and %d.\n", avctx->bit_rate,
+ 256000 * channels);
+ ret = AVERROR(EINVAL);
+ goto fail;
+ }
+
+ ret = libopus_configure_encoder(avctx, enc, &opus->opts);
+ if (ret != OPUS_OK) {
+ ret = ff_opus_error_to_averror(ret);
+ goto fail;
+ }
+
+ /* Header includes channel mapping table if and only if mapping family is NOT 0 */
+ header_size = 19 + (mapping_family == 0 ? 0 : 2 + channels);
+ avctx->extradata = av_malloc(header_size + AV_INPUT_BUFFER_PADDING_SIZE);
+ if (!avctx->extradata) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to allocate extradata.\n");
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+ avctx->extradata_size = header_size;
+
+ opus->samples = av_calloc(frame_size, channels *
+ av_get_bytes_per_sample(avctx->sample_fmt));
+ if (!opus->samples) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to allocate samples buffer.\n");
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+
+ ret = opus_multistream_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&avctx->initial_padding));
+ if (ret != OPUS_OK)
+ av_log(avctx, AV_LOG_WARNING,
+ "Unable to get number of lookahead samples: %s\n",
+ opus_strerror(ret));
+
+ libopus_write_header(avctx, opus->stream_count, coupled_stream_count,
+ mapping_family, libopus_channel_mapping);
+
+ ff_af_queue_init(avctx, &opus->afq);
+
+ opus->enc = enc;
+
+ return 0;
+
+fail:
+ opus_multistream_encoder_destroy(enc);
+ return ret;
+}
+
+static void libopus_copy_samples_with_channel_map(
+ uint8_t *dst, const uint8_t *src, const uint8_t *channel_map,
+ int nb_channels, int nb_samples, int bytes_per_sample) {
+ int sample, channel;
+ for (sample = 0; sample < nb_samples; ++sample) {
+ for (channel = 0; channel < nb_channels; ++channel) {
+ const size_t src_pos = bytes_per_sample * (nb_channels * sample + channel);
+ const size_t dst_pos = bytes_per_sample * (nb_channels * sample + channel_map[channel]);
+
+ memcpy(&dst[dst_pos], &src[src_pos], bytes_per_sample);
+ }
+ }
+}
+
+static int libopus_encode(AVCodecContext *avctx, AVPacket *avpkt,
+ const AVFrame *frame, int *got_packet_ptr)
+{
+ LibopusEncContext *opus = avctx->priv_data;
+ const int bytes_per_sample = av_get_bytes_per_sample(avctx->sample_fmt);
+ const int channels = avctx->ch_layout.nb_channels;
+ const int sample_size = channels * bytes_per_sample;
+ const uint8_t *audio;
+ int ret;
+ int discard_padding;
+
+ if (frame) {
+ ret = ff_af_queue_add(&opus->afq, frame);
+ if (ret < 0)
+ return ret;
+ if (opus->encoder_channel_map != NULL) {
+ audio = opus->samples;
+ libopus_copy_samples_with_channel_map(
+ opus->samples, frame->data[0], opus->encoder_channel_map,
+ channels, frame->nb_samples, bytes_per_sample);
+ } else if (frame->nb_samples < opus->opts.packet_size) {
+ audio = opus->samples;
+ memcpy(opus->samples, frame->data[0], frame->nb_samples * sample_size);
+ } else
+ audio = frame->data[0];
+ } else {
+ if (!opus->afq.remaining_samples || (!opus->afq.frame_alloc && !opus->afq.frame_count))
+ return 0;
+ audio = opus->samples;
+ memset(opus->samples, 0, opus->opts.packet_size * sample_size);
+ }
+
+ /* Maximum packet size taken from opusenc in opus-tools. 120ms packets
+ * consist of 6 frames in one packet. The maximum frame size is 1275
+ * bytes along with the largest possible packet header of 7 bytes. */
+ if ((ret = ff_alloc_packet(avctx, avpkt, (1275 * 6 + 7) * opus->stream_count)) < 0)
+ return ret;
+
+ if (avctx->sample_fmt == AV_SAMPLE_FMT_FLT)
+ ret = opus_multistream_encode_float(opus->enc, (const float *)audio,
+ opus->opts.packet_size,
+ avpkt->data, avpkt->size);
+ else
+ ret = opus_multistream_encode(opus->enc, (const opus_int16 *)audio,
+ opus->opts.packet_size,
+ avpkt->data, avpkt->size);
+
+ if (ret < 0) {
+ av_log(avctx, AV_LOG_ERROR,
+ "Error encoding frame: %s\n", opus_strerror(ret));
+ return ff_opus_error_to_averror(ret);
+ }
+
+ av_shrink_packet(avpkt, ret);
+
+ ff_af_queue_remove(&opus->afq, opus->opts.packet_size,
+ &avpkt->pts, &avpkt->duration);
+
+ discard_padding = opus->opts.packet_size - avpkt->duration;
+ // Check if subtraction resulted in an overflow
+ if ((discard_padding < opus->opts.packet_size) != (avpkt->duration > 0))
+ return AVERROR(EINVAL);
+ if (discard_padding > 0) {
+ uint8_t* side_data = av_packet_new_side_data(avpkt,
+ AV_PKT_DATA_SKIP_SAMPLES,
+ 10);
+ if (!side_data)
+ return AVERROR(ENOMEM);
+ AV_WL32(side_data + 4, discard_padding);
+ }
+
+ *got_packet_ptr = 1;
+
+ return 0;
+}
+
+static av_cold int libopus_encode_close(AVCodecContext *avctx)
+{
+ LibopusEncContext *opus = avctx->priv_data;
+
+ opus_multistream_encoder_destroy(opus->enc);
+
+ ff_af_queue_close(&opus->afq);
+
+ av_freep(&opus->samples);
+
+ return 0;
+}
+
+#define OFFSET(x) offsetof(LibopusEncContext, opts.x)
+#define FLAGS AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+static const AVOption libopus_options[] = {
+ { "application", "Intended application type", OFFSET(application), AV_OPT_TYPE_INT, { .i64 = OPUS_APPLICATION_AUDIO }, OPUS_APPLICATION_VOIP, OPUS_APPLICATION_RESTRICTED_LOWDELAY, FLAGS, "application" },
+ { "voip", "Favor improved speech intelligibility", 0, AV_OPT_TYPE_CONST, { .i64 = OPUS_APPLICATION_VOIP }, 0, 0, FLAGS, "application" },
+ { "audio", "Favor faithfulness to the input", 0, AV_OPT_TYPE_CONST, { .i64 = OPUS_APPLICATION_AUDIO }, 0, 0, FLAGS, "application" },
+ { "lowdelay", "Restrict to only the lowest delay modes", 0, AV_OPT_TYPE_CONST, { .i64 = OPUS_APPLICATION_RESTRICTED_LOWDELAY }, 0, 0, FLAGS, "application" },
+ { "frame_duration", "Duration of a frame in milliseconds", OFFSET(frame_duration), AV_OPT_TYPE_FLOAT, { .dbl = 20.0 }, 2.5, 120.0, FLAGS },
+ { "packet_loss", "Expected packet loss percentage", OFFSET(packet_loss), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 100, FLAGS },
+ { "fec", "Enable inband FEC. Expected packet loss must be non-zero", OFFSET(fec), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
+ { "vbr", "Variable bit rate mode", OFFSET(vbr), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 2, FLAGS, "vbr" },
+ { "off", "Use constant bit rate", 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, 0, 0, FLAGS, "vbr" },
+ { "on", "Use variable bit rate", 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, 0, 0, FLAGS, "vbr" },
+ { "constrained", "Use constrained VBR", 0, AV_OPT_TYPE_CONST, { .i64 = 2 }, 0, 0, FLAGS, "vbr" },
+ { "mapping_family", "Channel Mapping Family", OFFSET(mapping_family), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 255, FLAGS, "mapping_family" },
+ { "dtx", "Enable DTX", OFFSET(dtx), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
+#ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST
+ { "apply_phase_inv", "Apply intensity stereo phase inversion", OFFSET(apply_phase_inv), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS },
+#endif
+ { NULL },
+};
+
+static const AVClass libopus_class = {
+ .class_name = "libopus",
+ .item_name = av_default_item_name,
+ .option = libopus_options,
+ .version = LIBAVUTIL_VERSION_INT,
+};
+
+static const FFCodecDefault libopus_defaults[] = {
+ { "b", "0" },
+ { "compression_level", "10" },
+ { NULL },
+};
+
+static const int libopus_sample_rates[] = {
+ 48000, 24000, 16000, 12000, 8000, 0,
+};
+
+const FFCodec ff_libopus_encoder = {
+ .p.name = "libopus",
+ CODEC_LONG_NAME("libopus Opus"),
+ .p.type = AVMEDIA_TYPE_AUDIO,
+ .p.id = AV_CODEC_ID_OPUS,
+ .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
+ AV_CODEC_CAP_SMALL_LAST_FRAME,
+ .caps_internal = FF_CODEC_CAP_NOT_INIT_THREADSAFE,
+ .priv_data_size = sizeof(LibopusEncContext),
+ .init = libopus_encode_init,
+ FF_CODEC_ENCODE_CB(libopus_encode),
+ .close = libopus_encode_close,
+ .p.sample_fmts = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
+ AV_SAMPLE_FMT_FLT,
+ AV_SAMPLE_FMT_NONE },
+ .p.supported_samplerates = libopus_sample_rates,
+ .p.priv_class = &libopus_class,
+ .defaults = libopus_defaults,
+ .p.wrapper_name = "libopus",
+};
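A note on the packet bookkeeping in libopus_encode above: with the default frame_duration of 20 ms, frame_size works out to 20 * 48000 / 1000 = 960 samples, and any shortfall in the final packet is signalled as trailing discard via AV_PKT_DATA_SKIP_SAMPLES. That side data's documented layout is 10 bytes: a little-endian u32 of samples to skip at the start, a u32 of samples to discard at the end (the value written at offset 4 above), and two reason bytes. A hedged sketch of reading it back on the consuming side, assuming a current libavcodec where av_packet_get_side_data takes a size_t*:

#include <libavcodec/packet.h>
#include <libavutil/intreadwrite.h>

/* Sketch: recover the trailing-discard count that libopus_encode() stores.
 * Relies only on the documented AV_PKT_DATA_SKIP_SAMPLES layout. */
static uint32_t sketch_get_discard_padding(const AVPacket *pkt)
{
    size_t size;
    uint8_t *sd = av_packet_get_side_data(pkt, AV_PKT_DATA_SKIP_SAMPLES, &size);
    if (!sd || size < 10)
        return 0;            /* nothing to discard */
    return AV_RL32(sd + 4);  /* samples to drop from the end of the packet */
}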
diff --git a/media/ffvpx/libavcodec/libvorbisenc.c b/media/ffvpx/libavcodec/libvorbisenc.c
new file mode 100644
index 0000000000..6331cf0d79
--- /dev/null
+++ b/media/ffvpx/libavcodec/libvorbisenc.c
@@ -0,0 +1,393 @@
+/*
+ * Copyright (c) 2002 Mark Hills <mark@pogo.org.uk>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <vorbis/vorbisenc.h>
+
+#include "libavutil/avassert.h"
+#include "libavutil/channel_layout.h"
+#include "libavutil/fifo.h"
+#include "libavutil/opt.h"
+#include "avcodec.h"
+#include "audio_frame_queue.h"
+#include "codec_internal.h"
+#include "encode.h"
+#include "version.h"
+#include "vorbis_parser.h"
+
+
+/* Number of samples the user should send in each call.
+ * This value is used because it is the LCD of all possible frame sizes, so
+ * an output packet will always start at the same point as one of the input
+ * packets.
+ */
+#define LIBVORBIS_FRAME_SIZE 64
+
+#define BUFFER_SIZE (1024 * 64)
+
+typedef struct LibvorbisEncContext {
+ AVClass *av_class; /**< class for AVOptions */
+ vorbis_info vi; /**< vorbis_info used during init */
+ vorbis_dsp_state vd; /**< DSP state used for analysis */
+ vorbis_block vb; /**< vorbis_block used for analysis */
+ AVFifo *pkt_fifo; /**< output packet buffer */
+ int eof; /**< end-of-file flag */
+ int dsp_initialized; /**< vd has been initialized */
+ vorbis_comment vc; /**< VorbisComment info */
+ double iblock; /**< impulse block bias option */
+ AVVorbisParseContext *vp; /**< parse context to get durations */
+ AudioFrameQueue afq; /**< frame queue for timestamps */
+} LibvorbisEncContext;
+
+static const AVOption options[] = {
+ { "iblock", "Sets the impulse block bias", offsetof(LibvorbisEncContext, iblock), AV_OPT_TYPE_DOUBLE, { .dbl = 0 }, -15, 0, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM },
+ { NULL }
+};
+
+static const FFCodecDefault defaults[] = {
+ { "b", "0" },
+ { NULL },
+};
+
+static const AVClass vorbis_class = {
+ .class_name = "libvorbis",
+ .item_name = av_default_item_name,
+ .option = options,
+ .version = LIBAVUTIL_VERSION_INT,
+};
+
+static const uint8_t vorbis_encoding_channel_layout_offsets[8][8] = {
+ { 0 },
+ { 0, 1 },
+ { 0, 2, 1 },
+ { 0, 1, 2, 3 },
+ { 0, 2, 1, 3, 4 },
+ { 0, 2, 1, 4, 5, 3 },
+ { 0, 2, 1, 5, 6, 4, 3 },
+ { 0, 2, 1, 6, 7, 4, 5, 3 },
+};
+
+static int vorbis_error_to_averror(int ov_err)
+{
+ switch (ov_err) {
+ case OV_EFAULT: return AVERROR_BUG;
+ case OV_EINVAL: return AVERROR(EINVAL);
+ case OV_EIMPL: return AVERROR(EINVAL);
+ default: return AVERROR_UNKNOWN;
+ }
+}
+
+static av_cold int libvorbis_setup(vorbis_info *vi, AVCodecContext *avctx)
+{
+ LibvorbisEncContext *s = avctx->priv_data;
+ int channels = avctx->ch_layout.nb_channels;
+ double cfreq;
+ int ret;
+
+ if (avctx->flags & AV_CODEC_FLAG_QSCALE || !avctx->bit_rate) {
+ /* variable bitrate
+ * NOTE: we use the oggenc range of -1 to 10 for global_quality for
+ * user convenience, but libvorbis uses -0.1 to 1.0.
+ */
+ float q = avctx->global_quality / (float)FF_QP2LAMBDA;
+ /* default to 3 if the user did not set quality or bitrate */
+ if (!(avctx->flags & AV_CODEC_FLAG_QSCALE))
+ q = 3.0;
+ if ((ret = vorbis_encode_setup_vbr(vi, channels,
+ avctx->sample_rate,
+ q / 10.0)))
+ goto error;
+ } else {
+ int minrate = avctx->rc_min_rate > 0 ? avctx->rc_min_rate : -1;
+ int maxrate = avctx->rc_max_rate > 0 ? avctx->rc_max_rate : -1;
+
+ /* average bitrate */
+ if ((ret = vorbis_encode_setup_managed(vi, channels,
+ avctx->sample_rate, maxrate,
+ avctx->bit_rate, minrate)))
+ goto error;
+
+ /* variable bitrate by estimate, disable slow rate management */
+ if (minrate == -1 && maxrate == -1)
+ if ((ret = vorbis_encode_ctl(vi, OV_ECTL_RATEMANAGE2_SET, NULL)))
+ goto error; /* should not happen */
+ }
+
+ /* cutoff frequency */
+ if (avctx->cutoff > 0) {
+ cfreq = avctx->cutoff / 1000.0;
+ if ((ret = vorbis_encode_ctl(vi, OV_ECTL_LOWPASS_SET, &cfreq)))
+ goto error; /* should not happen */
+ }
+
+ /* impulse block bias */
+ if (s->iblock) {
+ if ((ret = vorbis_encode_ctl(vi, OV_ECTL_IBLOCK_SET, &s->iblock)))
+ goto error;
+ }
+
+ if ((channels == 3 &&
+ av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_SURROUND)) ||
+ (channels == 4 &&
+ av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_2_2) &&
+ av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_QUAD)) ||
+ (channels == 5 &&
+ av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_5POINT0) &&
+ av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_5POINT0_BACK)) ||
+ (channels == 6 &&
+ av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_5POINT1) &&
+ av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_5POINT1_BACK)) ||
+ (channels == 7 &&
+ av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_6POINT1)) ||
+ (channels == 8 &&
+ av_channel_layout_compare(&avctx->ch_layout, &(AVChannelLayout)AV_CHANNEL_LAYOUT_7POINT1))) {
+ if (avctx->ch_layout.order != AV_CHANNEL_ORDER_UNSPEC) {
+ char name[32];
+ av_channel_layout_describe(&avctx->ch_layout, name, sizeof(name));
+ av_log(avctx, AV_LOG_ERROR, "%s not supported by Vorbis: "
+ "output stream will have incorrect "
+ "channel layout.\n", name);
+ } else {
+ av_log(avctx, AV_LOG_WARNING, "No channel layout specified. The encoder "
+ "will use Vorbis channel layout for "
+ "%d channels.\n", channels);
+ }
+ }
+
+ if ((ret = vorbis_encode_setup_init(vi)))
+ goto error;
+
+ return 0;
+error:
+ return vorbis_error_to_averror(ret);
+}
+
+/* How many bytes are needed for a buffer of length 'l' */
+static int xiph_len(int l)
+{
+ return 1 + l / 255 + l;
+}
+
+static av_cold int libvorbis_encode_close(AVCodecContext *avctx)
+{
+ LibvorbisEncContext *s = avctx->priv_data;
+
+ /* notify vorbisenc this is EOF */
+ if (s->dsp_initialized)
+ vorbis_analysis_wrote(&s->vd, 0);
+
+ vorbis_block_clear(&s->vb);
+ vorbis_dsp_clear(&s->vd);
+ vorbis_info_clear(&s->vi);
+
+ av_fifo_freep2(&s->pkt_fifo);
+ ff_af_queue_close(&s->afq);
+
+ av_vorbis_parse_free(&s->vp);
+
+ return 0;
+}
+
+static av_cold int libvorbis_encode_init(AVCodecContext *avctx)
+{
+ LibvorbisEncContext *s = avctx->priv_data;
+ ogg_packet header, header_comm, header_code;
+ uint8_t *p;
+ unsigned int offset;
+ int ret;
+
+ vorbis_info_init(&s->vi);
+ if ((ret = libvorbis_setup(&s->vi, avctx))) {
+ av_log(avctx, AV_LOG_ERROR, "encoder setup failed\n");
+ goto error;
+ }
+ if ((ret = vorbis_analysis_init(&s->vd, &s->vi))) {
+ av_log(avctx, AV_LOG_ERROR, "analysis init failed\n");
+ ret = vorbis_error_to_averror(ret);
+ goto error;
+ }
+ s->dsp_initialized = 1;
+ if ((ret = vorbis_block_init(&s->vd, &s->vb))) {
+ av_log(avctx, AV_LOG_ERROR, "dsp init failed\n");
+ ret = vorbis_error_to_averror(ret);
+ goto error;
+ }
+
+ vorbis_comment_init(&s->vc);
+ if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT))
+ vorbis_comment_add_tag(&s->vc, "encoder", LIBAVCODEC_IDENT);
+
+ if ((ret = vorbis_analysis_headerout(&s->vd, &s->vc, &header, &header_comm,
+ &header_code))) {
+ ret = vorbis_error_to_averror(ret);
+ goto error;
+ }
+
+ avctx->extradata_size = 1 + xiph_len(header.bytes) +
+ xiph_len(header_comm.bytes) +
+ header_code.bytes;
+ p = avctx->extradata = av_malloc(avctx->extradata_size +
+ AV_INPUT_BUFFER_PADDING_SIZE);
+ if (!p) {
+ ret = AVERROR(ENOMEM);
+ goto error;
+ }
+ p[0] = 2;
+ offset = 1;
+ offset += av_xiphlacing(&p[offset], header.bytes);
+ offset += av_xiphlacing(&p[offset], header_comm.bytes);
+ memcpy(&p[offset], header.packet, header.bytes);
+ offset += header.bytes;
+ memcpy(&p[offset], header_comm.packet, header_comm.bytes);
+ offset += header_comm.bytes;
+ memcpy(&p[offset], header_code.packet, header_code.bytes);
+ offset += header_code.bytes;
+ av_assert0(offset == avctx->extradata_size);
+
+ s->vp = av_vorbis_parse_init(avctx->extradata, avctx->extradata_size);
+ if (!s->vp) {
+ av_log(avctx, AV_LOG_ERROR, "invalid extradata\n");
+ return ret;
+ }
+
+ vorbis_comment_clear(&s->vc);
+
+ avctx->frame_size = LIBVORBIS_FRAME_SIZE;
+ ff_af_queue_init(avctx, &s->afq);
+
+ s->pkt_fifo = av_fifo_alloc2(BUFFER_SIZE, 1, 0);
+ if (!s->pkt_fifo) {
+ ret = AVERROR(ENOMEM);
+ goto error;
+ }
+
+ return 0;
+error:
+ libvorbis_encode_close(avctx);
+ return ret;
+}
+
+static int libvorbis_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+ const AVFrame *frame, int *got_packet_ptr)
+{
+ LibvorbisEncContext *s = avctx->priv_data;
+ ogg_packet op;
+ int ret, duration;
+
+ /* send samples to libvorbis */
+ if (frame) {
+ const int samples = frame->nb_samples;
+ float **buffer;
+ int c, channels = s->vi.channels;
+
+ buffer = vorbis_analysis_buffer(&s->vd, samples);
+ for (c = 0; c < channels; c++) {
+ int co = (channels > 8) ? c :
+ vorbis_encoding_channel_layout_offsets[channels - 1][c];
+ memcpy(buffer[c], frame->extended_data[co],
+ samples * sizeof(*buffer[c]));
+ }
+ if ((ret = vorbis_analysis_wrote(&s->vd, samples)) < 0) {
+ av_log(avctx, AV_LOG_ERROR, "error in vorbis_analysis_wrote()\n");
+ return vorbis_error_to_averror(ret);
+ }
+ if ((ret = ff_af_queue_add(&s->afq, frame)) < 0)
+ return ret;
+ } else {
+ if (!s->eof && s->afq.frame_alloc)
+ if ((ret = vorbis_analysis_wrote(&s->vd, 0)) < 0) {
+ av_log(avctx, AV_LOG_ERROR, "error in vorbis_analysis_wrote()\n");
+ return vorbis_error_to_averror(ret);
+ }
+ s->eof = 1;
+ }
+
+ /* retrieve available packets from libvorbis */
+ while ((ret = vorbis_analysis_blockout(&s->vd, &s->vb)) == 1) {
+ if ((ret = vorbis_analysis(&s->vb, NULL)) < 0)
+ break;
+ if ((ret = vorbis_bitrate_addblock(&s->vb)) < 0)
+ break;
+
+ /* add any available packets to the output packet buffer */
+ while ((ret = vorbis_bitrate_flushpacket(&s->vd, &op)) == 1) {
+ if (av_fifo_can_write(s->pkt_fifo) < sizeof(ogg_packet) + op.bytes) {
+ av_log(avctx, AV_LOG_ERROR, "packet buffer is too small\n");
+ return AVERROR_BUG;
+ }
+ av_fifo_write(s->pkt_fifo, &op, sizeof(ogg_packet));
+ av_fifo_write(s->pkt_fifo, op.packet, op.bytes);
+ }
+ if (ret < 0) {
+ av_log(avctx, AV_LOG_ERROR, "error getting available packets\n");
+ break;
+ }
+ }
+ if (ret < 0) {
+ av_log(avctx, AV_LOG_ERROR, "error getting available packets\n");
+ return vorbis_error_to_averror(ret);
+ }
+
+ /* Read an available packet if possible */
+ if (av_fifo_read(s->pkt_fifo, &op, sizeof(ogg_packet)) < 0)
+ return 0;
+
+ if ((ret = ff_get_encode_buffer(avctx, avpkt, op.bytes, 0)) < 0)
+ return ret;
+ av_fifo_read(s->pkt_fifo, avpkt->data, op.bytes);
+
+ avpkt->pts = ff_samples_to_time_base(avctx, op.granulepos);
+
+ duration = av_vorbis_parse_frame(s->vp, avpkt->data, avpkt->size);
+ if (duration > 0) {
+ /* we do not know encoder delay until we get the first packet from
+ * libvorbis, so we have to update the AudioFrameQueue counts */
+ if (!avctx->initial_padding && s->afq.frames) {
+ avctx->initial_padding = duration;
+ av_assert0(!s->afq.remaining_delay);
+ s->afq.frames->duration += duration;
+ if (s->afq.frames->pts != AV_NOPTS_VALUE)
+ s->afq.frames->pts -= duration;
+ s->afq.remaining_samples += duration;
+ }
+ ff_af_queue_remove(&s->afq, duration, &avpkt->pts, &avpkt->duration);
+ }
+
+ *got_packet_ptr = 1;
+ return 0;
+}
+
+const FFCodec ff_libvorbis_encoder = {
+ .p.name = "libvorbis",
+ CODEC_LONG_NAME("libvorbis"),
+ .p.type = AVMEDIA_TYPE_AUDIO,
+ .p.id = AV_CODEC_ID_VORBIS,
+ .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
+ AV_CODEC_CAP_SMALL_LAST_FRAME,
+ .caps_internal = FF_CODEC_CAP_NOT_INIT_THREADSAFE,
+ .priv_data_size = sizeof(LibvorbisEncContext),
+ .init = libvorbis_encode_init,
+ FF_CODEC_ENCODE_CB(libvorbis_encode_frame),
+ .close = libvorbis_encode_close,
+ .p.sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_FLTP,
+ AV_SAMPLE_FMT_NONE },
+ .p.priv_class = &vorbis_class,
+ .defaults = defaults,
+ .p.wrapper_name = "libvorbis",
+};
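The extradata built in libvorbis_encode_init above uses Xiph lacing: byte 0 is the header count minus one (2), followed by the laced sizes of the first two headers (written with av_xiphlacing), then the three header packets back to back. That is why xiph_len(l) = 1 + l/255 + l: a length is coded as floor(l/255) bytes of 255 plus one remainder byte. A small self-contained sketch of the same size coding; sketch_xiph_lace is a hypothetical helper equivalent to av_xiphlacing:

#include <stddef.h>
#include <stdint.h>

/* Sketch of Xiph-style length lacing: floor(len/255) bytes of 255,
 * then the remainder. Returns the number of size bytes written. */
static size_t sketch_xiph_lace(uint8_t *dst, size_t len)
{
    size_t n = 0;
    for (; len >= 255; len -= 255)
        dst[n++] = 255;
    dst[n++] = (uint8_t)len;
    return n;
}

For example, a 30-byte header laces to the single size byte 30 (so xiph_len(30) == 31), while a 600-byte header laces to 255, 255, 90.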
diff --git a/media/ffvpx/libavcodec/moz.build b/media/ffvpx/libavcodec/moz.build
index 0ba603d172..886fa7a2cb 100644
--- a/media/ffvpx/libavcodec/moz.build
+++ b/media/ffvpx/libavcodec/moz.build
@@ -20,6 +20,7 @@ LOCAL_INCLUDES += ['/modules/fdlibm/inexact-math-override']
SharedLibrary('mozavcodec')
SOURCES += [
'allcodecs.c',
+ 'audio_frame_queue.c',
'avcodec.c',
'avdct.c',
'avfft.c',
@@ -47,7 +48,9 @@ SOURCES += [
'jrevdct.c',
'libopus.c',
'libopusdec.c',
+ 'libopusenc.c',
'libvorbisdec.c',
+ 'libvorbisenc.c',
'log2_tab.c',
'mpegaudio.c',
'mpegaudiodata.c',
diff --git a/media/ffvpx/libavutil/avutil.symbols b/media/ffvpx/libavutil/avutil.symbols
index 0ad6fad9cd..5ee7afb855 100644
--- a/media/ffvpx/libavutil/avutil.symbols
+++ b/media/ffvpx/libavutil/avutil.symbols
@@ -92,6 +92,7 @@ av_fifo_alloc
av_fifo_alloc2
av_fifo_alloc_array
av_fifo_can_read
+av_fifo_can_write
av_fifo_drain
av_fifo_drain2
av_fifo_free
diff --git a/media/ffvpx/opusenc-dtx.patch b/media/ffvpx/opusenc-dtx.patch
new file mode 100644
index 0000000000..bf9fc9de87
--- /dev/null
+++ b/media/ffvpx/opusenc-dtx.patch
@@ -0,0 +1,63 @@
+diff --git a/media/ffvpx/libavcodec/libopusenc.c b/media/ffvpx/libavcodec/libopusenc.c
+--- a/media/ffvpx/libavcodec/libopusenc.c
++++ b/media/ffvpx/libavcodec/libopusenc.c
+@@ -37,16 +37,17 @@ typedef struct LibopusEncOpts {
+ int application;
+ int packet_loss;
+ int fec;
+ int complexity;
+ float frame_duration;
+ int packet_size;
+ int max_bandwidth;
+ int mapping_family;
++ int dtx;
+ #ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST
+ int apply_phase_inv;
+ #endif
+ } LibopusEncOpts;
+
+ typedef struct LibopusEncContext {
+ AVClass *class;
+ OpusMSEncoder *enc;
+@@ -154,16 +155,23 @@ static int libopus_configure_encoder(AVC
+
+ ret = opus_multistream_encoder_ctl(enc,
+ OPUS_SET_INBAND_FEC(opts->fec));
+ if (ret != OPUS_OK)
+ av_log(avctx, AV_LOG_WARNING,
+ "Unable to set inband FEC: %s\n",
+ opus_strerror(ret));
+
++ ret = opus_multistream_encoder_ctl(enc,
++ OPUS_SET_DTX(opts->dtx));
++ if (ret != OPUS_OK)
++ av_log(avctx, AV_LOG_WARNING,
++ "Unable to set DTX: %s\n",
++ opus_strerror(ret));
++
+ if (avctx->cutoff) {
+ ret = opus_multistream_encoder_ctl(enc,
+ OPUS_SET_MAX_BANDWIDTH(opts->max_bandwidth));
+ if (ret != OPUS_OK)
+ av_log(avctx, AV_LOG_WARNING,
+ "Unable to set maximum bandwidth: %s\n", opus_strerror(ret));
+ }
+
+@@ -551,16 +559,17 @@ static const AVOption libopus_options[]
+ { "frame_duration", "Duration of a frame in milliseconds", OFFSET(frame_duration), AV_OPT_TYPE_FLOAT, { .dbl = 20.0 }, 2.5, 120.0, FLAGS },
+ { "packet_loss", "Expected packet loss percentage", OFFSET(packet_loss), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 100, FLAGS },
+ { "fec", "Enable inband FEC. Expected packet loss must be non-zero", OFFSET(fec), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
+ { "vbr", "Variable bit rate mode", OFFSET(vbr), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 2, FLAGS, "vbr" },
+ { "off", "Use constant bit rate", 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, 0, 0, FLAGS, "vbr" },
+ { "on", "Use variable bit rate", 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, 0, 0, FLAGS, "vbr" },
+ { "constrained", "Use constrained VBR", 0, AV_OPT_TYPE_CONST, { .i64 = 2 }, 0, 0, FLAGS, "vbr" },
+ { "mapping_family", "Channel Mapping Family", OFFSET(mapping_family), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 255, FLAGS, "mapping_family" },
++ { "dtx", "Enable DTX", OFFSET(dtx), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
+ #ifdef OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST
+ { "apply_phase_inv", "Apply intensity stereo phase inversion", OFFSET(apply_phase_inv), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS },
+ #endif
+ { NULL },
+ };
+
+ static const AVClass libopus_class = {
+ .class_name = "libopus",
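
Once the patch is applied, the new option is reachable through the standard AVOptions machinery, like fec or packet_loss. A hedged usage sketch follows; the API calls are stock FFmpeg, but the parameter values are illustrative and not taken from this patch.

/* Sketch: open a libopus encoder with the DTX option added above. */
#include <libavcodec/avcodec.h>
#include <libavutil/opt.h>

static int open_libopus_with_dtx(AVCodecContext **out)
{
    const AVCodec *codec = avcodec_find_encoder_by_name("libopus");
    AVCodecContext *avctx;
    if (!codec || !(avctx = avcodec_alloc_context3(codec)))
        return AVERROR(ENOMEM);
    avctx->sample_rate = 48000;
    avctx->sample_fmt  = AV_SAMPLE_FMT_S16;
    avctx->bit_rate    = 24000;
    av_channel_layout_default(&avctx->ch_layout, 1);  /* mono */
    /* the private "dtx" option introduced by this patch */
    av_opt_set_int(avctx->priv_data, "dtx", 1, 0);
    *out = avctx;
    return avcodec_open2(avctx, codec, NULL);
}

With the ffmpeg CLI the same option surfaces as -dtx 1 on a libopus output stream.
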
diff --git a/media/libaom/0001-errno.patch b/media/libaom/0001-errno.patch
new file mode 100644
index 0000000000..6040c42e38
--- /dev/null
+++ b/media/libaom/0001-errno.patch
@@ -0,0 +1,22 @@
+diff --git a/aom_util/aom_pthread.h b/aom_util/aom_pthread.h
+--- a/aom_util/aom_pthread.h
++++ b/aom_util/aom_pthread.h
+@@ -30,16 +30,18 @@ extern "C" {
+ #define WIN32_LEAN_AND_MEAN
+ #include <process.h> // NOLINT
+ #include <stddef.h> // NOLINT
+ #include <windows.h> // NOLINT
+ typedef HANDLE pthread_t;
+ typedef int pthread_attr_t;
+ typedef CRITICAL_SECTION pthread_mutex_t;
+
++#include <errno.h>
++
+ #if _WIN32_WINNT < 0x0600
+ #error _WIN32_WINNT must target Windows Vista / Server 2008 or newer.
+ #endif
+ typedef CONDITION_VARIABLE pthread_cond_t;
+
+ #ifndef WINAPI_FAMILY_PARTITION
+ #define WINAPI_PARTITION_DESKTOP 1
+ #define WINAPI_FAMILY_PARTITION(x) x
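
The bare include is presumably needed because the Win32 pthread shims in this header return POSIX error codes such as EBUSY (e.g. from pthread_mutex_trylock), and Windows toolchains only declare those constants in <errno.h>, which the header did not previously include.
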
diff --git a/media/libaom/0002-mmloadusi64.patch b/media/libaom/0002-mmloadusi64.patch
new file mode 100644
index 0000000000..9d23c90f22
--- /dev/null
+++ b/media/libaom/0002-mmloadusi64.patch
@@ -0,0 +1,79 @@
+diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h
+--- a/aom_dsp/x86/synonyms.h
++++ b/aom_dsp/x86/synonyms.h
+@@ -41,22 +41,34 @@ static INLINE __m128i xx_loadl_64(const
+ static INLINE __m128i xx_load_128(const void *a) {
+ return _mm_load_si128((const __m128i *)a);
+ }
+
+ static INLINE __m128i xx_loadu_128(const void *a) {
+ return _mm_loadu_si128((const __m128i *)a);
+ }
+
++
++// _mm_loadu_si64 was introduced in GCC 9; reimplement the function
++// manually on older compilers.
++#if !defined(__clang__) && __GNUC__ < 9
++static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) {
++ __m64 hi_, lo_;
++ memcpy(&hi_, hi, sizeof(hi_));
++ memcpy(&lo_, lo, sizeof(lo_));
++ return _mm_set_epi64(hi_, lo_);
++}
++#else
+ // Load 64 bits from each of hi and low, and pack into an SSE register
+ // Since directly loading as `int64_t`s and using _mm_set_epi64 may violate
+ // the strict aliasing rule, this takes a different approach
+ static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) {
+ return _mm_unpacklo_epi64(_mm_loadu_si64(lo), _mm_loadu_si64(hi));
+ }
++#endif
+
+ static INLINE void xx_storel_32(void *const a, const __m128i v) {
+ const int val = _mm_cvtsi128_si32(v);
+ memcpy(a, &val, sizeof(val));
+ }
+
+ static INLINE void xx_storel_64(void *const a, const __m128i v) {
+ _mm_storel_epi64((__m128i *)a, v);
+diff --git a/aom_dsp/x86/synonyms_avx2.h b/aom_dsp/x86/synonyms_avx2.h
+--- a/aom_dsp/x86/synonyms_avx2.h
++++ b/aom_dsp/x86/synonyms_avx2.h
+@@ -66,21 +66,36 @@ static INLINE __m256i yy_set1_64_from_32
+
+ // Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
+ // therefore define an equivalent function using a different intrinsic.
+ // ([ hi ], [ lo ]) -> [ hi ][ lo ]
+ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+ }
+
++#define GCC_VERSION (__GNUC__ * 10000 \
++ + __GNUC_MINOR__ * 100 \
++ + __GNUC_PATCHLEVEL__)
++
++// _mm256_loadu2_m128i was introduced in GCC 10.1 (GCC_VERSION 100100)
++#if !defined(__clang__) && GCC_VERSION < 100100
++static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
++ __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
++ __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
++ return _mm256_set_m128i(mhi, mlo);
++}
++#else
+ static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
+ __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
+ __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
+ return yy_set_m128i(mhi, mlo);
+ }
++#endif
++
++#undef GCC_VERSION
+
+ static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
+ _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));
+ _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a));
+ }
+
+ static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
+ const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
diff --git a/media/libaom/config/generic/config/aom_config.asm b/media/libaom/config/generic/config/aom_config.asm
index be0715562c..0f329a7df5 100644
--- a/media/libaom/config/generic/config/aom_config.asm
+++ b/media/libaom/config/generic/config/aom_config.asm
@@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1
CONFIG_OUTPUT_FRAME_SIZE equ 0
CONFIG_PARTITION_SEARCH_ORDER equ 0
CONFIG_PIC equ 0
+CONFIG_QUANT_MATRIX equ 1
CONFIG_RATECTRL_LOG equ 0
CONFIG_RD_COMMAND equ 0
CONFIG_RD_DEBUG equ 0
@@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 0
HAVE_SSE4_2 equ 0
HAVE_SSSE3 equ 0
HAVE_SVE equ 0
+HAVE_SVE2 equ 0
HAVE_VSX equ 0
HAVE_WXWIDGETS equ 0
STATIC_LINK_JXL equ 0
diff --git a/media/libaom/config/generic/config/aom_config.h b/media/libaom/config/generic/config/aom_config.h
index a695b0b3e6..c89e1d755c 100644
--- a/media/libaom/config/generic/config/aom_config.h
+++ b/media/libaom/config/generic/config/aom_config.h
@@ -55,6 +55,7 @@
#define CONFIG_OUTPUT_FRAME_SIZE 0
#define CONFIG_PARTITION_SEARCH_ORDER 0
#define CONFIG_PIC 0
+#define CONFIG_QUANT_MATRIX 1
#define CONFIG_RATECTRL_LOG 0
#define CONFIG_RD_COMMAND 0
#define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
#define HAVE_SSE4_2 0
#define HAVE_SSSE3 0
#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_VSX 0
#define HAVE_WXWIDGETS 0
#define INLINE inline
diff --git a/media/libaom/config/generic/config/aom_dsp_rtcd.h b/media/libaom/config/generic/config/aom_dsp_rtcd.h
index 0418b3568e..a61dc47a47 100644
--- a/media/libaom/config/generic/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/generic/config/aom_dsp_rtcd.h
@@ -46,9 +46,15 @@ void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int
void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
#define aom_comp_mask_pred aom_comp_mask_pred_c
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+#define aom_compute_correlation aom_compute_correlation_c
+
void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
#define aom_compute_flow_at_point aom_compute_flow_at_point_c
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+#define aom_compute_mean_stddev aom_compute_mean_stddev_c
+
void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define aom_convolve8 aom_convolve8_c
@@ -4693,9 +4699,6 @@ unsigned int aom_variance8x8_c(const uint8_t *src_ptr, int source_stride, const
int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl);
#define aom_vector_var aom_vector_var_c
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-#define av1_compute_cross_correlation av1_compute_cross_correlation_c
-
void aom_dsp_rtcd(void);
#include "config/aom_config.h"
diff --git a/media/libaom/config/generic/config/aom_scale_rtcd.h b/media/libaom/config/generic/config/aom_scale_rtcd.h
index 733b2d9ea1..dd09c4e3a6 100644
--- a/media/libaom/config/generic/config/aom_scale_rtcd.h
+++ b/media/libaom/config/generic/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
#define RTCD_EXTERN extern
#endif
+#include <stdbool.h>
+
struct yv12_buffer_config;
#ifdef __cplusplus
extern "C" {
#endif
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
#define aom_extend_frame_borders aom_extend_frame_borders_c
void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
#define aom_yv12_copy_frame aom_yv12_copy_frame_c
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_u aom_yv12_copy_u_c
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_v aom_yv12_copy_v_c
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
#define aom_yv12_copy_y aom_yv12_copy_y_c
void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
void aom_scale_rtcd(void);
diff --git a/media/libaom/config/linux/arm/config/aom_config.asm b/media/libaom/config/linux/arm/config/aom_config.asm
index 63034fd7e2..1ec673f263 100644
--- a/media/libaom/config/linux/arm/config/aom_config.asm
+++ b/media/libaom/config/linux/arm/config/aom_config.asm
@@ -53,6 +53,7 @@
.equ CONFIG_OUTPUT_FRAME_SIZE, 0
.equ CONFIG_PARTITION_SEARCH_ORDER, 0
.equ CONFIG_PIC, 1
+.equ CONFIG_QUANT_MATRIX, 1
.equ CONFIG_RATECTRL_LOG, 0
.equ CONFIG_RD_COMMAND, 0
.equ CONFIG_RD_DEBUG, 0
@@ -87,6 +88,7 @@
.equ HAVE_SSE4_2, 0
.equ HAVE_SSSE3, 0
.equ HAVE_SVE, 0
+.equ HAVE_SVE2, 0
.equ HAVE_VSX, 0
.equ HAVE_WXWIDGETS, 0
.equ STATIC_LINK_JXL, 0
diff --git a/media/libaom/config/linux/arm/config/aom_config.h b/media/libaom/config/linux/arm/config/aom_config.h
index 3cbe7bf169..fb73e8431e 100644
--- a/media/libaom/config/linux/arm/config/aom_config.h
+++ b/media/libaom/config/linux/arm/config/aom_config.h
@@ -55,6 +55,7 @@
#define CONFIG_OUTPUT_FRAME_SIZE 0
#define CONFIG_PARTITION_SEARCH_ORDER 0
#define CONFIG_PIC 1
+#define CONFIG_QUANT_MATRIX 1
#define CONFIG_RATECTRL_LOG 0
#define CONFIG_RD_COMMAND 0
#define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
#define HAVE_SSE4_2 0
#define HAVE_SSSE3 0
#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_VSX 0
#define HAVE_WXWIDGETS 0
#define INLINE inline
diff --git a/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h b/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h
index 50ee78932c..fffcc5a3e9 100644
--- a/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h
@@ -54,10 +54,16 @@ void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, in
void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+#define aom_compute_correlation aom_compute_correlation_c
+
void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
void aom_compute_flow_at_point_neon(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+#define aom_compute_mean_stddev aom_compute_mean_stddev_c
+
void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define aom_convolve8 aom_convolve8_c
@@ -6212,9 +6218,6 @@ int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl);
int aom_vector_var_neon(const int16_t *ref, const int16_t *src, int bwl);
RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-#define av1_compute_cross_correlation av1_compute_cross_correlation_c
-
void aom_dsp_rtcd(void);
#include "config/aom_config.h"
diff --git a/media/libaom/config/linux/arm/config/aom_scale_rtcd.h b/media/libaom/config/linux/arm/config/aom_scale_rtcd.h
index d296957f84..1024a666fe 100644
--- a/media/libaom/config/linux/arm/config/aom_scale_rtcd.h
+++ b/media/libaom/config/linux/arm/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
#define RTCD_EXTERN extern
#endif
+#include <stdbool.h>
+
struct yv12_buffer_config;
#ifdef __cplusplus
extern "C" {
#endif
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
#define aom_extend_frame_borders aom_extend_frame_borders_c
void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
#define aom_yv12_copy_frame aom_yv12_copy_frame_c
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_u aom_yv12_copy_u_c
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_v aom_yv12_copy_v_c
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
#define aom_yv12_copy_y aom_yv12_copy_y_c
void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
void aom_scale_rtcd(void);
diff --git a/media/libaom/config/linux/ia32/config/aom_config.asm b/media/libaom/config/linux/ia32/config/aom_config.asm
index e75260cb09..4fd596e34b 100644
--- a/media/libaom/config/linux/ia32/config/aom_config.asm
+++ b/media/libaom/config/linux/ia32/config/aom_config.asm
@@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1
CONFIG_OUTPUT_FRAME_SIZE equ 0
CONFIG_PARTITION_SEARCH_ORDER equ 0
CONFIG_PIC equ 1
+CONFIG_QUANT_MATRIX equ 1
CONFIG_RATECTRL_LOG equ 0
CONFIG_RD_COMMAND equ 0
CONFIG_RD_DEBUG equ 0
@@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 1
HAVE_SSE4_2 equ 1
HAVE_SSSE3 equ 1
HAVE_SVE equ 0
+HAVE_SVE2 equ 0
HAVE_VSX equ 0
HAVE_WXWIDGETS equ 0
STATIC_LINK_JXL equ 0
diff --git a/media/libaom/config/linux/ia32/config/aom_config.h b/media/libaom/config/linux/ia32/config/aom_config.h
index b0e5b5cabc..256f556662 100644
--- a/media/libaom/config/linux/ia32/config/aom_config.h
+++ b/media/libaom/config/linux/ia32/config/aom_config.h
@@ -55,6 +55,7 @@
#define CONFIG_OUTPUT_FRAME_SIZE 0
#define CONFIG_PARTITION_SEARCH_ORDER 0
#define CONFIG_PIC 1
+#define CONFIG_QUANT_MATRIX 1
#define CONFIG_RATECTRL_LOG 0
#define CONFIG_RD_COMMAND 0
#define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
#define HAVE_SSE4_2 1
#define HAVE_SSSE3 1
#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_VSX 0
#define HAVE_WXWIDGETS 0
#define INLINE inline
diff --git a/media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h b/media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h
index a19adf5f61..93472f0e92 100644
--- a/media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h
@@ -57,21 +57,30 @@ void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width
void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+RTCD_EXTERN double (*aom_compute_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+
void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+RTCD_EXTERN bool (*aom_compute_mean_stddev)(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+
void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define aom_convolve8 aom_convolve8_c
void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -903,6 +912,7 @@ RTCD_EXTERN unsigned int (*aom_highbd_10_masked_sub_pixel_variance8x8)(const uin
unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*aom_highbd_10_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int aom_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
@@ -5130,7 +5140,8 @@ unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const
RTCD_EXTERN unsigned int (*aom_sad16x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x4x3d aom_sad16x4x3d_c
+void aom_sad16x4x3d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad16x4x3d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -5466,7 +5477,8 @@ unsigned int aom_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const u
#define aom_sad_skip_16x4 aom_sad_skip_16x4_c
void aom_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad_skip_16x4x4d aom_sad_skip_16x4x4d_c
+void aom_sad_skip_16x4x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad_skip_16x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
unsigned int aom_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -5867,243 +5879,199 @@ void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, ui
#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c
uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -6326,11 +6294,6 @@ int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl);
int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl);
RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-RTCD_EXTERN double (*av1_compute_cross_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-
void aom_dsp_rtcd(void);
#ifdef RTCD_C
@@ -6360,14 +6323,19 @@ static void setup_rtcd_internal(void)
aom_comp_mask_pred = aom_comp_mask_pred_c;
if (flags & HAS_SSSE3) aom_comp_mask_pred = aom_comp_mask_pred_ssse3;
if (flags & HAS_AVX2) aom_comp_mask_pred = aom_comp_mask_pred_avx2;
+ aom_compute_correlation = aom_compute_correlation_c;
+ if (flags & HAS_SSE4_1) aom_compute_correlation = aom_compute_correlation_sse4_1;
+ if (flags & HAS_AVX2) aom_compute_correlation = aom_compute_correlation_avx2;
aom_compute_flow_at_point = aom_compute_flow_at_point_c;
if (flags & HAS_SSE4_1) aom_compute_flow_at_point = aom_compute_flow_at_point_sse4_1;
+ if (flags & HAS_AVX2) aom_compute_flow_at_point = aom_compute_flow_at_point_avx2;
+ aom_compute_mean_stddev = aom_compute_mean_stddev_c;
+ if (flags & HAS_SSE4_1) aom_compute_mean_stddev = aom_compute_mean_stddev_sse4_1;
+ if (flags & HAS_AVX2) aom_compute_mean_stddev = aom_compute_mean_stddev_avx2;
aom_convolve8_horiz = aom_convolve8_horiz_c;
- if (flags & HAS_SSE2) aom_convolve8_horiz = aom_convolve8_horiz_sse2;
if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
aom_convolve8_vert = aom_convolve8_vert_c;
- if (flags & HAS_SSE2) aom_convolve8_vert = aom_convolve8_vert_sse2;
if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
aom_convolve_copy = aom_convolve_copy_c;
@@ -6768,6 +6736,7 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_ssse3;
aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_c;
if (flags & HAS_SSE2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_sse2;
+ if (flags & HAS_AVX2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_avx2;
aom_highbd_10_mse8x8 = aom_highbd_10_mse8x8_c;
if (flags & HAS_SSE2) aom_highbd_10_mse8x8 = aom_highbd_10_mse8x8_sse2;
aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_c;
@@ -8526,6 +8495,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) aom_sad16x4 = aom_sad16x4_sse2;
aom_sad16x4_avg = aom_sad16x4_avg_c;
if (flags & HAS_SSE2) aom_sad16x4_avg = aom_sad16x4_avg_sse2;
+ aom_sad16x4x3d = aom_sad16x4x3d_c;
+ if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2;
aom_sad16x4x4d = aom_sad16x4x4d_c;
if (flags & HAS_SSE2) aom_sad16x4x4d = aom_sad16x4x4d_sse2;
if (flags & HAS_AVX2) aom_sad16x4x4d = aom_sad16x4x4d_avx2;
@@ -8695,6 +8666,8 @@ static void setup_rtcd_internal(void)
aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_c;
if (flags & HAS_SSE2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_sse2;
if (flags & HAS_AVX2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_avx2;
+ aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_c;
+ if (flags & HAS_AVX2) aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_avx2;
aom_sad_skip_16x64 = aom_sad_skip_16x64_c;
if (flags & HAS_SSE2) aom_sad_skip_16x64 = aom_sad_skip_16x64_sse2;
aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_c;
@@ -8897,157 +8870,113 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE4_1) aom_sse = aom_sse_sse4_1;
if (flags & HAS_AVX2) aom_sse = aom_sse_avx2;
aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_avx2;
aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_avx2;
aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_ssse3;
aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_ssse3;
aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_ssse3;
aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_ssse3;
aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_ssse3;
aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_avx2;
aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_avx2;
aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_avx2;
aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_ssse3;
aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_ssse3;
aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3;
aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_ssse3;
aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_avx2;
aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_ssse3;
aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_avx2;
aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_avx2;
aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_ssse3;
aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_ssse3;
aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_ssse3;
aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_ssse3;
aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_avx2;
aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_avx2;
aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2;
aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_avx2;
aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_avx2;
aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_avx2;
aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_avx2;
aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_avx2;
aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_avx2;
aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_avx2;
aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_ssse3;
aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_ssse3;
aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_ssse3;
aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_ssse3;
aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_avx2;
aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_ssse3;
aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_avx2;
aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_avx2;
aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_ssse3;
aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_ssse3;
aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_ssse3;
aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_ssse3;
aom_subtract_block = aom_subtract_block_c;
if (flags & HAS_SSE2) aom_subtract_block = aom_subtract_block_sse2;
@@ -9172,9 +9101,6 @@ static void setup_rtcd_internal(void)
aom_vector_var = aom_vector_var_c;
if (flags & HAS_SSE4_1) aom_vector_var = aom_vector_var_sse4_1;
if (flags & HAS_AVX2) aom_vector_var = aom_vector_var_avx2;
- av1_compute_cross_correlation = av1_compute_cross_correlation_c;
- if (flags & HAS_SSE4_1) av1_compute_cross_correlation = av1_compute_cross_correlation_sse4_1;
- if (flags & HAS_AVX2) av1_compute_cross_correlation = av1_compute_cross_correlation_avx2;
}
#endif
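(Note: every hunk in this generated header follows the same run-time CPU detection (RTCD) pattern: each kernel gets a function pointer declared with RTCD_EXTERN, and setup_rtcd_internal() repoints it once, from the portable C fallback to the best SIMD variant the host CPU reports. Below is a minimal, self-contained sketch of that pattern; all names in it are illustrative stand-ins, not aom's actual identifiers, and the capability bits are invented for the example.)

#include <stdint.h>

/* Hypothetical capability bits, standing in for aom's HAS_SSSE3/HAS_AVX2. */
#define DEMO_HAS_SSSE3 (1 << 0)
#define DEMO_HAS_AVX2  (1 << 1)

/* C reference kernel plus stub "SIMD" variants; a real build would supply
 * hand-written SSSE3/AVX2 bodies with the same signature. */
static uint32_t demo_variance_c(const uint8_t *src, int stride)     { (void)stride; return src[0]; }
static uint32_t demo_variance_ssse3(const uint8_t *src, int stride) { return demo_variance_c(src, stride); }
static uint32_t demo_variance_avx2(const uint8_t *src, int stride)  { return demo_variance_c(src, stride); }

/* Dispatch pointer: the analogue of the RTCD_EXTERN declarations above. */
static uint32_t (*demo_variance)(const uint8_t *src, int stride);

/* Analogue of setup_rtcd_internal(): start at the portable C kernel, then
 * upgrade in order of increasing capability so the last matching flag wins.
 * Dropping an ISA level (as the SSE2 removals in the hunks above do) is
 * simply deleting one of these assignment lines. */
static void demo_setup_rtcd(int flags)
{
    demo_variance = demo_variance_c;
    if (flags & DEMO_HAS_SSSE3) demo_variance = demo_variance_ssse3;
    if (flags & DEMO_HAS_AVX2)  demo_variance = demo_variance_avx2;
}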
diff --git a/media/libaom/config/linux/ia32/config/aom_scale_rtcd.h b/media/libaom/config/linux/ia32/config/aom_scale_rtcd.h
index 3b70fb47c3..cdabb21106 100644
--- a/media/libaom/config/linux/ia32/config/aom_scale_rtcd.h
+++ b/media/libaom/config/linux/ia32/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
#define RTCD_EXTERN extern
#endif
+#include <stdbool.h>
+
struct yv12_buffer_config;
#ifdef __cplusplus
extern "C" {
#endif
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
#define aom_extend_frame_borders aom_extend_frame_borders_c
void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
#define aom_yv12_copy_frame aom_yv12_copy_frame_c
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_u aom_yv12_copy_u_c
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_v aom_yv12_copy_v_c
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
#define aom_yv12_copy_y aom_yv12_copy_y_c
void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
void aom_scale_rtcd(void);
diff --git a/media/libaom/config/linux/ia32/config/av1_rtcd.h b/media/libaom/config/linux/ia32/config/av1_rtcd.h
index 3f404f61c8..37716517bf 100644
--- a/media/libaom/config/linux/ia32/config/av1_rtcd.h
+++ b/media/libaom/config/linux/ia32/config/av1_rtcd.h
@@ -265,7 +265,6 @@ void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *ds
#define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
RTCD_EXTERN void (*av1_dist_wtd_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@@ -764,84 +763,72 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
RTCD_EXTERN void (*cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
RTCD_EXTERN void (*cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_0_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_0_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_0_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_0)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_1)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_2)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_3)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_0)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_1)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_2)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_3)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
@@ -969,7 +956,6 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) av1_convolve_y_sr = av1_convolve_y_sr_sse2;
if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c;
- if (flags & HAS_SSE2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_sse2;
if (flags & HAS_SSSE3) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3;
if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2;
av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_c;
@@ -1176,62 +1162,50 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c;
- if (flags & HAS_SSE2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse2;
if (flags & HAS_SSSE3) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_ssse3;
if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1;
if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2;
cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c;
- if (flags & HAS_SSE2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse2;
if (flags & HAS_SSSE3) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_ssse3;
if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1;
if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2;
cdef_filter_16_0 = cdef_filter_16_0_c;
- if (flags & HAS_SSE2) cdef_filter_16_0 = cdef_filter_16_0_sse2;
if (flags & HAS_SSSE3) cdef_filter_16_0 = cdef_filter_16_0_ssse3;
if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2;
cdef_filter_16_1 = cdef_filter_16_1_c;
- if (flags & HAS_SSE2) cdef_filter_16_1 = cdef_filter_16_1_sse2;
if (flags & HAS_SSSE3) cdef_filter_16_1 = cdef_filter_16_1_ssse3;
if (flags & HAS_SSE4_1) cdef_filter_16_1 = cdef_filter_16_1_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2;
cdef_filter_16_2 = cdef_filter_16_2_c;
- if (flags & HAS_SSE2) cdef_filter_16_2 = cdef_filter_16_2_sse2;
if (flags & HAS_SSSE3) cdef_filter_16_2 = cdef_filter_16_2_ssse3;
if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2;
cdef_filter_16_3 = cdef_filter_16_3_c;
- if (flags & HAS_SSE2) cdef_filter_16_3 = cdef_filter_16_3_sse2;
if (flags & HAS_SSSE3) cdef_filter_16_3 = cdef_filter_16_3_ssse3;
if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2;
cdef_filter_8_0 = cdef_filter_8_0_c;
- if (flags & HAS_SSE2) cdef_filter_8_0 = cdef_filter_8_0_sse2;
if (flags & HAS_SSSE3) cdef_filter_8_0 = cdef_filter_8_0_ssse3;
if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2;
cdef_filter_8_1 = cdef_filter_8_1_c;
- if (flags & HAS_SSE2) cdef_filter_8_1 = cdef_filter_8_1_sse2;
if (flags & HAS_SSSE3) cdef_filter_8_1 = cdef_filter_8_1_ssse3;
if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2;
cdef_filter_8_2 = cdef_filter_8_2_c;
- if (flags & HAS_SSE2) cdef_filter_8_2 = cdef_filter_8_2_sse2;
if (flags & HAS_SSSE3) cdef_filter_8_2 = cdef_filter_8_2_ssse3;
if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2;
cdef_filter_8_3 = cdef_filter_8_3_c;
- if (flags & HAS_SSE2) cdef_filter_8_3 = cdef_filter_8_3_sse2;
if (flags & HAS_SSSE3) cdef_filter_8_3 = cdef_filter_8_3_ssse3;
if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2;
cdef_find_dir = cdef_find_dir_c;
- if (flags & HAS_SSE2) cdef_find_dir = cdef_find_dir_sse2;
if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
cdef_find_dir_dual = cdef_find_dir_dual_c;
- if (flags & HAS_SSE2) cdef_find_dir_dual = cdef_find_dir_dual_sse2;
if (flags & HAS_SSSE3) cdef_find_dir_dual = cdef_find_dir_dual_ssse3;
if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1;
if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2;
diff --git a/media/libaom/config/linux/x64/config/aom_config.asm b/media/libaom/config/linux/x64/config/aom_config.asm
index f793ff3c6d..3f470f3a5f 100644
--- a/media/libaom/config/linux/x64/config/aom_config.asm
+++ b/media/libaom/config/linux/x64/config/aom_config.asm
@@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1
CONFIG_OUTPUT_FRAME_SIZE equ 0
CONFIG_PARTITION_SEARCH_ORDER equ 0
CONFIG_PIC equ 0
+CONFIG_QUANT_MATRIX equ 1
CONFIG_RATECTRL_LOG equ 0
CONFIG_RD_COMMAND equ 0
CONFIG_RD_DEBUG equ 0
@@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 1
HAVE_SSE4_2 equ 1
HAVE_SSSE3 equ 1
HAVE_SVE equ 0
+HAVE_SVE2 equ 0
HAVE_VSX equ 0
HAVE_WXWIDGETS equ 0
STATIC_LINK_JXL equ 0
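(Note: the CONFIG_*/HAVE_* values are kept in lockstep between the .asm and .h forms of this file so assembly and C translation units see identical build configuration; on the C side they are plain 0/1 macros consumed with #if. A hedged sketch of that usage follows; demo_quantize() is an invented name, not an aom function.)

/* Illustrative consumption of a 0/1 config macro like the ones above. */
#define DEMO_CONFIG_QUANT_MATRIX 1  /* mirrors CONFIG_QUANT_MATRIX for this target */

static void demo_quantize(void)
{
#if DEMO_CONFIG_QUANT_MATRIX
    /* quantization-matrix code path compiled in when the flag is 1 */
#else
    /* flat-quantizer fallback compiled in when the flag is 0 */
#endif
}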
diff --git a/media/libaom/config/linux/x64/config/aom_config.h b/media/libaom/config/linux/x64/config/aom_config.h
index 670d2ffe56..6d96b65b07 100644
--- a/media/libaom/config/linux/x64/config/aom_config.h
+++ b/media/libaom/config/linux/x64/config/aom_config.h
@@ -55,6 +55,7 @@
#define CONFIG_OUTPUT_FRAME_SIZE 0
#define CONFIG_PARTITION_SEARCH_ORDER 0
#define CONFIG_PIC 0
+#define CONFIG_QUANT_MATRIX 1
#define CONFIG_RATECTRL_LOG 0
#define CONFIG_RD_COMMAND 0
#define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
#define HAVE_SSE4_2 1
#define HAVE_SSSE3 1
#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_VSX 0
#define HAVE_WXWIDGETS 0
#define INLINE inline
diff --git a/media/libaom/config/linux/x64/config/aom_dsp_rtcd.h b/media/libaom/config/linux/x64/config/aom_dsp_rtcd.h
index 8e979cc189..9135c6f423 100644
--- a/media/libaom/config/linux/x64/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/linux/x64/config/aom_dsp_rtcd.h
@@ -57,21 +57,30 @@ void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width
void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+RTCD_EXTERN double (*aom_compute_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+
void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+RTCD_EXTERN bool (*aom_compute_mean_stddev)(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+
void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define aom_convolve8 aom_convolve8_c
void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -903,7 +912,8 @@ RTCD_EXTERN unsigned int (*aom_highbd_10_masked_sub_pixel_variance8x8)(const uin
unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define aom_highbd_10_mse16x16 aom_highbd_10_mse16x16_sse2
+unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*aom_highbd_10_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int aom_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define aom_highbd_10_mse16x8 aom_highbd_10_mse16x8_c
@@ -5132,7 +5142,8 @@ unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const
#define aom_sad16x4_avg aom_sad16x4_avg_sse2
void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x4x3d aom_sad16x4x3d_c
+void aom_sad16x4x3d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad16x4x3d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -5468,7 +5479,8 @@ unsigned int aom_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const u
#define aom_sad_skip_16x4 aom_sad_skip_16x4_c
void aom_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad_skip_16x4x4d aom_sad_skip_16x4x4d_c
+void aom_sad_skip_16x4x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad_skip_16x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
unsigned int aom_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -5870,243 +5882,199 @@ void aom_ssim_parms_8x8_sse2(const uint8_t *s, int sp, const uint8_t *r, int rp,
#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_sse2
uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -6329,11 +6297,6 @@ int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl);
int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl);
RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-RTCD_EXTERN double (*av1_compute_cross_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-
void aom_dsp_rtcd(void);
#ifdef RTCD_C
@@ -6358,12 +6321,19 @@ static void setup_rtcd_internal(void)
aom_comp_mask_pred = aom_comp_mask_pred_c;
if (flags & HAS_SSSE3) aom_comp_mask_pred = aom_comp_mask_pred_ssse3;
if (flags & HAS_AVX2) aom_comp_mask_pred = aom_comp_mask_pred_avx2;
+ aom_compute_correlation = aom_compute_correlation_c;
+ if (flags & HAS_SSE4_1) aom_compute_correlation = aom_compute_correlation_sse4_1;
+ if (flags & HAS_AVX2) aom_compute_correlation = aom_compute_correlation_avx2;
aom_compute_flow_at_point = aom_compute_flow_at_point_c;
if (flags & HAS_SSE4_1) aom_compute_flow_at_point = aom_compute_flow_at_point_sse4_1;
- aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+ if (flags & HAS_AVX2) aom_compute_flow_at_point = aom_compute_flow_at_point_avx2;
+ aom_compute_mean_stddev = aom_compute_mean_stddev_c;
+ if (flags & HAS_SSE4_1) aom_compute_mean_stddev = aom_compute_mean_stddev_sse4_1;
+ if (flags & HAS_AVX2) aom_compute_mean_stddev = aom_compute_mean_stddev_avx2;
+ aom_convolve8_horiz = aom_convolve8_horiz_c;
if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
- aom_convolve8_vert = aom_convolve8_vert_sse2;
+ aom_convolve8_vert = aom_convolve8_vert_c;
if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
aom_convolve_copy = aom_convolve_copy_sse2;
@@ -6528,6 +6498,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x4 = aom_highbd_10_masked_sub_pixel_variance8x4_ssse3;
aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_c;
if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_ssse3;
+ aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_sse2;
+ if (flags & HAS_AVX2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_avx2;
aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_c;
if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_sse4_1;
aom_highbd_10_obmc_variance128x64 = aom_highbd_10_obmc_variance128x64_c;
@@ -7626,6 +7598,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) aom_sad16x32x3d = aom_sad16x32x3d_avx2;
aom_sad16x32x4d = aom_sad16x32x4d_sse2;
if (flags & HAS_AVX2) aom_sad16x32x4d = aom_sad16x32x4d_avx2;
+ aom_sad16x4x3d = aom_sad16x4x3d_c;
+ if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2;
aom_sad16x4x4d = aom_sad16x4x4d_sse2;
if (flags & HAS_AVX2) aom_sad16x4x4d = aom_sad16x4x4d_avx2;
aom_sad16x64x3d = aom_sad16x64x3d_c;
@@ -7704,6 +7678,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) aom_sad_skip_16x16x4d = aom_sad_skip_16x16x4d_avx2;
aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_sse2;
if (flags & HAS_AVX2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_avx2;
+ aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_c;
+ if (flags & HAS_AVX2) aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_avx2;
aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_sse2;
if (flags & HAS_AVX2) aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_avx2;
aom_sad_skip_16x8x4d = aom_sad_skip_16x8x4d_sse2;
@@ -7859,114 +7835,114 @@ static void setup_rtcd_internal(void)
aom_sse = aom_sse_c;
if (flags & HAS_SSE4_1) aom_sse = aom_sse_sse4_1;
if (flags & HAS_AVX2) aom_sse = aom_sse_avx2;
- aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_sse2;
+ aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_avx2;
- aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_sse2;
+ aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_avx2;
- aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_sse2;
+ aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_ssse3;
- aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_sse2;
+ aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_ssse3;
- aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_sse2;
+ aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_ssse3;
- aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_sse2;
+ aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_ssse3;
- aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_sse2;
+ aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_ssse3;
- aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_sse2;
+ aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_avx2;
- aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_sse2;
+ aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_avx2;
- aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_sse2;
+ aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_avx2;
- aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_sse2;
+ aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_ssse3;
- aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_sse2;
+ aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_ssse3;
- aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_sse2;
+ aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3;
- aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_sse2;
+ aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_ssse3;
- aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_sse2;
+ aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_avx2;
- aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_sse2;
+ aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_ssse3;
- aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_sse2;
+ aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_avx2;
- aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_sse2;
+ aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_avx2;
- aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_sse2;
+ aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_ssse3;
- aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_sse2;
+ aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_ssse3;
- aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_sse2;
+ aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_ssse3;
- aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_sse2;
+ aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_ssse3;
- aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_sse2;
+ aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_avx2;
- aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_sse2;
+ aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_avx2;
- aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_sse2;
+ aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2;
- aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_sse2;
+ aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_avx2;
- aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_sse2;
+ aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_avx2;
- aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_sse2;
+ aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_avx2;
- aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_sse2;
+ aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_avx2;
- aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_sse2;
+ aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_avx2;
- aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_sse2;
+ aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_avx2;
- aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_sse2;
+ aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_avx2;
- aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_sse2;
+ aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_ssse3;
- aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_sse2;
+ aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_ssse3;
- aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_sse2;
+ aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_ssse3;
- aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_sse2;
+ aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_ssse3;
- aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_sse2;
+ aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_avx2;
- aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_sse2;
+ aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_ssse3;
- aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_sse2;
+ aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_avx2;
- aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_sse2;
+ aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_avx2;
- aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_sse2;
+ aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_ssse3;
- aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_sse2;
+ aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_ssse3;
- aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_sse2;
+ aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_ssse3;
- aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_sse2;
+ aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_ssse3;
aom_subtract_block = aom_subtract_block_sse2;
if (flags & HAS_AVX2) aom_subtract_block = aom_subtract_block_avx2;
@@ -8023,9 +7999,6 @@ static void setup_rtcd_internal(void)
aom_vector_var = aom_vector_var_c;
if (flags & HAS_SSE4_1) aom_vector_var = aom_vector_var_sse4_1;
if (flags & HAS_AVX2) aom_vector_var = aom_vector_var_avx2;
- av1_compute_cross_correlation = av1_compute_cross_correlation_c;
- if (flags & HAS_SSE4_1) av1_compute_cross_correlation = av1_compute_cross_correlation_sse4_1;
- if (flags & HAS_AVX2) av1_compute_cross_correlation = av1_compute_cross_correlation_avx2;
}
#endif
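The hunks above follow libaom's generated run-time CPU detection (RTCD) pattern: when only one implementation is usable at the configured baseline, the header emits a direct alias such as `#define aom_highbd_10_mse16x8 aom_highbd_10_mse16x8_c`; when several SIMD variants exist, it declares an `RTCD_EXTERN` function pointer that `setup_rtcd_internal()` initializes from the CPUID feature flags, with each later `if (flags & HAS_*)` assignment overriding the previous one so the strongest supported ISA wins. This update converts entries such as `aom_highbd_10_mse16x16` and `aom_sad16x4x3d` from defines to pointers because new AVX2 kernels were added, and, where upstream removed the SSE2 sub-pixel variance kernels, moves the unconditional fallback from `_sse2` to `_c`. Below is a minimal self-contained sketch of that dispatch pattern; the names (`my_kernel`, the hard-coded `x86_simd_caps()`) are illustrative stand-ins, not libaom symbols.

/* Sketch of the RTCD dispatch pattern used by these generated headers.
 * All names are hypothetical; in libaom the flags come from CPUID and
 * the SIMD variants are real vectorized kernels. */
#include <stdint.h>
#include <stdio.h>

#define HAS_SSSE3 0x1
#define HAS_AVX2  0x2

/* libaom queries CPUID here; hard-coded for the sketch. */
static int x86_simd_caps(void) { return HAS_SSSE3 | HAS_AVX2; }

/* Portable C fallback. */
static uint32_t my_kernel_c(const uint8_t *src, int n) {
  uint32_t sum = 0;
  for (int i = 0; i < n; ++i) sum += src[i];
  return sum;
}

/* Stand-ins for the SIMD variants the real headers declare. */
static uint32_t my_kernel_ssse3(const uint8_t *src, int n) { return my_kernel_c(src, n); }
static uint32_t my_kernel_avx2(const uint8_t *src, int n) { return my_kernel_c(src, n); }

/* The RTCD_EXTERN pointer: callers always go through my_kernel(...). */
static uint32_t (*my_kernel)(const uint8_t *src, int n);

static void setup_rtcd_internal_sketch(void) {
  int flags = x86_simd_caps();
  my_kernel = my_kernel_c;                             /* safe fallback */
  if (flags & HAS_SSSE3) my_kernel = my_kernel_ssse3;
  if (flags & HAS_AVX2)  my_kernel = my_kernel_avx2;   /* last write wins */
}

int main(void) {
  const uint8_t buf[4] = {1, 2, 3, 4};
  setup_rtcd_internal_sketch();
  printf("%u\n", my_kernel(buf, 4));  /* prints 10 */
  return 0;
}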
diff --git a/media/libaom/config/linux/x64/config/aom_scale_rtcd.h b/media/libaom/config/linux/x64/config/aom_scale_rtcd.h
index 3b70fb47c3..cdabb21106 100644
--- a/media/libaom/config/linux/x64/config/aom_scale_rtcd.h
+++ b/media/libaom/config/linux/x64/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
#define RTCD_EXTERN extern
#endif
+#include <stdbool.h>
+
struct yv12_buffer_config;
#ifdef __cplusplus
extern "C" {
#endif
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
#define aom_extend_frame_borders aom_extend_frame_borders_c
void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
#define aom_yv12_copy_frame aom_yv12_copy_frame_c
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_u aom_yv12_copy_u_c
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_v aom_yv12_copy_v_c
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
#define aom_yv12_copy_y aom_yv12_copy_y_c
void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
void aom_scale_rtcd(void);
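Besides the dispatch table, this hunk tracks upstream signature changes in aom_scale: the `aom_yv12_copy_y/u/v` helpers gain a `use_crop` argument, `aom_yv12_realloc_with_new_border` replaces its `int num_pyramid_levels` count with a `bool alloc_pyramid` flag, and the header now includes `<stdbool.h>` because, before C23, `bool` in a C prototype requires that header. A compilable sketch of the updated parameter shape follows; the stub body and its name are hypothetical, and only the opaque struct mirrors the generated header.

/* Hypothetical stub showing the new parameter shape: a boolean
 * "allocate pyramid?" flag instead of a pyramid level count. The real
 * definition lives in libaom and actually reallocates the buffer. */
#include <stdbool.h>
#include <stdio.h>

struct yv12_buffer_config;  /* opaque, as in the generated header */

static int yv12_realloc_sketch(struct yv12_buffer_config *ybf,
                               int new_border, int byte_alignment,
                               bool alloc_pyramid, int num_planes) {
  (void)ybf; (void)new_border; (void)byte_alignment;
  (void)alloc_pyramid; (void)num_planes;
  return 0;  /* placeholder success code */
}

int main(void) {
  /* NULL is fine for the stub; the real function allocates storage. */
  printf("%d\n", yv12_realloc_sketch(NULL, 288, 32, false, 3));
  return 0;
}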
diff --git a/media/libaom/config/linux/x64/config/av1_rtcd.h b/media/libaom/config/linux/x64/config/av1_rtcd.h
index b1cdc99700..ad72985afe 100644
--- a/media/libaom/config/linux/x64/config/av1_rtcd.h
+++ b/media/libaom/config/linux/x64/config/av1_rtcd.h
@@ -253,7 +253,6 @@ void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *ds
#define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
RTCD_EXTERN void (*av1_dist_wtd_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@@ -659,7 +658,6 @@ void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
-void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
RTCD_EXTERN void (*av1_lowbd_fwd_txfm)(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
@@ -755,85 +753,61 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
RTCD_EXTERN void (*cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
RTCD_EXTERN void (*cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_0_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_0_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_0)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_1)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_2)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_3)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_0)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_1)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_2)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_3)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
RTCD_EXTERN void (*cdef_find_dir_dual)(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
@@ -941,7 +915,7 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
av1_convolve_y_sr = av1_convolve_y_sr_sse2;
if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
- av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_sse2;
+ av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c;
if (flags & HAS_SSSE3) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3;
if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2;
av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_sse2;
@@ -1091,7 +1065,7 @@ static void setup_rtcd_internal(void)
av1_inv_txfm_add = av1_inv_txfm_add_c;
if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
- av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse2;
+ av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_c;
if (flags & HAS_SSE4_1) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse4_1;
if (flags & HAS_AVX2) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_avx2;
av1_lowbd_pixel_proj_error = av1_lowbd_pixel_proj_error_c;
@@ -1133,52 +1107,40 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_avx2;
av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
- cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse2;
- if (flags & HAS_SSSE3) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_ssse3;
+ cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c;
if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1;
if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2;
- cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse2;
- if (flags & HAS_SSSE3) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_ssse3;
+ cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c;
if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1;
if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2;
- cdef_filter_16_0 = cdef_filter_16_0_sse2;
- if (flags & HAS_SSSE3) cdef_filter_16_0 = cdef_filter_16_0_ssse3;
+ cdef_filter_16_0 = cdef_filter_16_0_c;
if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2;
- cdef_filter_16_1 = cdef_filter_16_1_sse2;
- if (flags & HAS_SSSE3) cdef_filter_16_1 = cdef_filter_16_1_ssse3;
+ cdef_filter_16_1 = cdef_filter_16_1_c;
if (flags & HAS_SSE4_1) cdef_filter_16_1 = cdef_filter_16_1_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2;
- cdef_filter_16_2 = cdef_filter_16_2_sse2;
- if (flags & HAS_SSSE3) cdef_filter_16_2 = cdef_filter_16_2_ssse3;
+ cdef_filter_16_2 = cdef_filter_16_2_c;
if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2;
- cdef_filter_16_3 = cdef_filter_16_3_sse2;
- if (flags & HAS_SSSE3) cdef_filter_16_3 = cdef_filter_16_3_ssse3;
+ cdef_filter_16_3 = cdef_filter_16_3_c;
if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2;
- cdef_filter_8_0 = cdef_filter_8_0_sse2;
- if (flags & HAS_SSSE3) cdef_filter_8_0 = cdef_filter_8_0_ssse3;
+ cdef_filter_8_0 = cdef_filter_8_0_c;
if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2;
- cdef_filter_8_1 = cdef_filter_8_1_sse2;
- if (flags & HAS_SSSE3) cdef_filter_8_1 = cdef_filter_8_1_ssse3;
+ cdef_filter_8_1 = cdef_filter_8_1_c;
if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2;
- cdef_filter_8_2 = cdef_filter_8_2_sse2;
- if (flags & HAS_SSSE3) cdef_filter_8_2 = cdef_filter_8_2_ssse3;
+ cdef_filter_8_2 = cdef_filter_8_2_c;
if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2;
- cdef_filter_8_3 = cdef_filter_8_3_sse2;
- if (flags & HAS_SSSE3) cdef_filter_8_3 = cdef_filter_8_3_ssse3;
+ cdef_filter_8_3 = cdef_filter_8_3_c;
if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2;
- cdef_find_dir = cdef_find_dir_sse2;
- if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+ cdef_find_dir = cdef_find_dir_c;
if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
- cdef_find_dir_dual = cdef_find_dir_dual_sse2;
- if (flags & HAS_SSSE3) cdef_find_dir_dual = cdef_find_dir_dual_ssse3;
+ cdef_find_dir_dual = cdef_find_dir_dual_c;
if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1;
if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2;
cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
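[Editor's note, not part of the patch] Every av1_rtcd.h hunk above follows one pattern: the SSE2/SSSE3 variants of the CDEF and transform kernels are removed, so setup_rtcd_internal now seeds each RTCD function pointer with the plain C kernel and upgrades it only when runtime CPU detection reports SSE4.1 or AVX2. A self-contained sketch of that dispatch scheme, with hypothetical kernel names (the real tables are generated by aom's rtcd scripts, and the stubs below stand in for actual SIMD code):

#include <stdint.h>
#include <stdio.h>

#define HAS_SSE4_1 (1 << 0)   /* placeholder bit values; the real flags */
#define HAS_AVX2   (1 << 1)   /* come from aom's CPU detection          */

/* Hypothetical family; real ones look like cdef_find_dir_{c,sse4_1,avx2}. */
static int my_find_dir_c(const uint16_t *img, int stride) {
  (void)img; (void)stride; return 0;           /* portable fallback */
}
static int my_find_dir_sse4_1(const uint16_t *img, int stride) {
  return my_find_dir_c(img, stride);           /* stub for the SIMD version */
}
static int my_find_dir_avx2(const uint16_t *img, int stride) {
  return my_find_dir_c(img, stride);           /* stub for the SIMD version */
}

/* Declared RTCD_EXTERN in the generated header; a plain pointer here. */
static int (*my_find_dir)(const uint16_t *img, int stride);

static void setup_rtcd_internal(int flags) {
  my_find_dir = my_find_dir_c;                 /* C baseline first, as in this patch */
  if (flags & HAS_SSE4_1) my_find_dir = my_find_dir_sse4_1;
  if (flags & HAS_AVX2)   my_find_dir = my_find_dir_avx2;  /* best match wins */
}

int main(void) {
  uint16_t img[8] = {0};
  setup_rtcd_internal(HAS_SSE4_1);             /* pretend CPUID found SSE4.1 */
  printf("%d\n", my_find_dir(img, 8));
  return 0;
}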
diff --git a/media/libaom/config/mac/x64/config/aom_config.asm b/media/libaom/config/mac/x64/config/aom_config.asm
index f793ff3c6d..3f470f3a5f 100644
--- a/media/libaom/config/mac/x64/config/aom_config.asm
+++ b/media/libaom/config/mac/x64/config/aom_config.asm
@@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1
CONFIG_OUTPUT_FRAME_SIZE equ 0
CONFIG_PARTITION_SEARCH_ORDER equ 0
CONFIG_PIC equ 0
+CONFIG_QUANT_MATRIX equ 1
CONFIG_RATECTRL_LOG equ 0
CONFIG_RD_COMMAND equ 0
CONFIG_RD_DEBUG equ 0
@@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 1
HAVE_SSE4_2 equ 1
HAVE_SSSE3 equ 1
HAVE_SVE equ 0
+HAVE_SVE2 equ 0
HAVE_VSX equ 0
HAVE_WXWIDGETS equ 0
STATIC_LINK_JXL equ 0
diff --git a/media/libaom/config/mac/x64/config/aom_config.h b/media/libaom/config/mac/x64/config/aom_config.h
index 670d2ffe56..6d96b65b07 100644
--- a/media/libaom/config/mac/x64/config/aom_config.h
+++ b/media/libaom/config/mac/x64/config/aom_config.h
@@ -55,6 +55,7 @@
#define CONFIG_OUTPUT_FRAME_SIZE 0
#define CONFIG_PARTITION_SEARCH_ORDER 0
#define CONFIG_PIC 0
+#define CONFIG_QUANT_MATRIX 1
#define CONFIG_RATECTRL_LOG 0
#define CONFIG_RD_COMMAND 0
#define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
#define HAVE_SSE4_2 1
#define HAVE_SSSE3 1
#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_VSX 0
#define HAVE_WXWIDGETS 0
#define INLINE inline
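[Editor's note, not part of the patch] aom_config.asm and aom_config.h are kept in lockstep per target; the new CONFIG_QUANT_MATRIX and HAVE_SVE2 entries are compile-time gates checked with the preprocessor (or the equ equivalents in assembly). A hedged sketch of how such a gate is typically consumed; the macros are defined inline here instead of including the generated header, which in-tree code would pull in as "config/aom_config.h":

/* Editor's sketch: stand-ins for the generated config values above. */
#define CONFIG_QUANT_MATRIX 1
#define HAVE_SVE2 0

#if CONFIG_QUANT_MATRIX
/* quantization-matrix tables and code paths get compiled in */
#endif

#if HAVE_SVE2
/* AArch64 SVE2 kernels would be built; 0 on this x86-64 config */
#endif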
diff --git a/media/libaom/config/mac/x64/config/aom_dsp_rtcd.h b/media/libaom/config/mac/x64/config/aom_dsp_rtcd.h
index 8e979cc189..9135c6f423 100644
--- a/media/libaom/config/mac/x64/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/mac/x64/config/aom_dsp_rtcd.h
@@ -57,21 +57,30 @@ void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width
void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+RTCD_EXTERN double (*aom_compute_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+
void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+RTCD_EXTERN bool (*aom_compute_mean_stddev)(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+
void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define aom_convolve8 aom_convolve8_c
void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -903,7 +912,8 @@ RTCD_EXTERN unsigned int (*aom_highbd_10_masked_sub_pixel_variance8x8)(const uin
unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define aom_highbd_10_mse16x16 aom_highbd_10_mse16x16_sse2
+unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*aom_highbd_10_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int aom_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define aom_highbd_10_mse16x8 aom_highbd_10_mse16x8_c
@@ -5132,7 +5142,8 @@ unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const
#define aom_sad16x4_avg aom_sad16x4_avg_sse2
void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x4x3d aom_sad16x4x3d_c
+void aom_sad16x4x3d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad16x4x3d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -5468,7 +5479,8 @@ unsigned int aom_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const u
#define aom_sad_skip_16x4 aom_sad_skip_16x4_c
void aom_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad_skip_16x4x4d aom_sad_skip_16x4x4d_c
+void aom_sad_skip_16x4x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad_skip_16x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
unsigned int aom_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -5870,243 +5882,199 @@ void aom_ssim_parms_8x8_sse2(const uint8_t *s, int sp, const uint8_t *r, int rp,
#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_sse2
uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -6329,11 +6297,6 @@ int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl);
int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl);
RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-RTCD_EXTERN double (*av1_compute_cross_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-
void aom_dsp_rtcd(void);
#ifdef RTCD_C
@@ -6358,12 +6321,19 @@ static void setup_rtcd_internal(void)
aom_comp_mask_pred = aom_comp_mask_pred_c;
if (flags & HAS_SSSE3) aom_comp_mask_pred = aom_comp_mask_pred_ssse3;
if (flags & HAS_AVX2) aom_comp_mask_pred = aom_comp_mask_pred_avx2;
+ aom_compute_correlation = aom_compute_correlation_c;
+ if (flags & HAS_SSE4_1) aom_compute_correlation = aom_compute_correlation_sse4_1;
+ if (flags & HAS_AVX2) aom_compute_correlation = aom_compute_correlation_avx2;
aom_compute_flow_at_point = aom_compute_flow_at_point_c;
if (flags & HAS_SSE4_1) aom_compute_flow_at_point = aom_compute_flow_at_point_sse4_1;
- aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+ if (flags & HAS_AVX2) aom_compute_flow_at_point = aom_compute_flow_at_point_avx2;
+ aom_compute_mean_stddev = aom_compute_mean_stddev_c;
+ if (flags & HAS_SSE4_1) aom_compute_mean_stddev = aom_compute_mean_stddev_sse4_1;
+ if (flags & HAS_AVX2) aom_compute_mean_stddev = aom_compute_mean_stddev_avx2;
+ aom_convolve8_horiz = aom_convolve8_horiz_c;
if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
- aom_convolve8_vert = aom_convolve8_vert_sse2;
+ aom_convolve8_vert = aom_convolve8_vert_c;
if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
aom_convolve_copy = aom_convolve_copy_sse2;
@@ -6528,6 +6498,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x4 = aom_highbd_10_masked_sub_pixel_variance8x4_ssse3;
aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_c;
if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_ssse3;
+ aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_sse2;
+ if (flags & HAS_AVX2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_avx2;
aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_c;
if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_sse4_1;
aom_highbd_10_obmc_variance128x64 = aom_highbd_10_obmc_variance128x64_c;
@@ -7626,6 +7598,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) aom_sad16x32x3d = aom_sad16x32x3d_avx2;
aom_sad16x32x4d = aom_sad16x32x4d_sse2;
if (flags & HAS_AVX2) aom_sad16x32x4d = aom_sad16x32x4d_avx2;
+ aom_sad16x4x3d = aom_sad16x4x3d_c;
+ if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2;
aom_sad16x4x4d = aom_sad16x4x4d_sse2;
if (flags & HAS_AVX2) aom_sad16x4x4d = aom_sad16x4x4d_avx2;
aom_sad16x64x3d = aom_sad16x64x3d_c;
@@ -7704,6 +7678,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) aom_sad_skip_16x16x4d = aom_sad_skip_16x16x4d_avx2;
aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_sse2;
if (flags & HAS_AVX2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_avx2;
+ aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_c;
+ if (flags & HAS_AVX2) aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_avx2;
aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_sse2;
if (flags & HAS_AVX2) aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_avx2;
aom_sad_skip_16x8x4d = aom_sad_skip_16x8x4d_sse2;
@@ -7859,114 +7835,114 @@ static void setup_rtcd_internal(void)
aom_sse = aom_sse_c;
if (flags & HAS_SSE4_1) aom_sse = aom_sse_sse4_1;
if (flags & HAS_AVX2) aom_sse = aom_sse_avx2;
- aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_sse2;
+ aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_avx2;
- aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_sse2;
+ aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_avx2;
- aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_sse2;
+ aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_ssse3;
- aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_sse2;
+ aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_ssse3;
- aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_sse2;
+ aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_ssse3;
- aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_sse2;
+ aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_ssse3;
- aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_sse2;
+ aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_ssse3;
- aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_sse2;
+ aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_avx2;
- aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_sse2;
+ aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_avx2;
- aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_sse2;
+ aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_avx2;
- aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_sse2;
+ aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_ssse3;
- aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_sse2;
+ aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_ssse3;
- aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_sse2;
+ aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3;
- aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_sse2;
+ aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_ssse3;
- aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_sse2;
+ aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_avx2;
- aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_sse2;
+ aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_ssse3;
- aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_sse2;
+ aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_avx2;
- aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_sse2;
+ aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_avx2;
- aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_sse2;
+ aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_ssse3;
- aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_sse2;
+ aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_ssse3;
- aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_sse2;
+ aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_ssse3;
- aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_sse2;
+ aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_ssse3;
- aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_sse2;
+ aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_avx2;
- aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_sse2;
+ aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_avx2;
- aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_sse2;
+ aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2;
- aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_sse2;
+ aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_avx2;
- aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_sse2;
+ aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_avx2;
- aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_sse2;
+ aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_avx2;
- aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_sse2;
+ aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_avx2;
- aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_sse2;
+ aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_avx2;
- aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_sse2;
+ aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_avx2;
- aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_sse2;
+ aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_avx2;
- aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_sse2;
+ aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_ssse3;
- aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_sse2;
+ aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_ssse3;
- aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_sse2;
+ aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_ssse3;
- aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_sse2;
+ aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_ssse3;
- aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_sse2;
+ aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_avx2;
- aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_sse2;
+ aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_ssse3;
- aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_sse2;
+ aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_avx2;
- aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_sse2;
+ aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_avx2;
- aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_sse2;
+ aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_ssse3;
- aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_sse2;
+ aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_ssse3;
- aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_sse2;
+ aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_ssse3;
- aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_sse2;
+ aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_ssse3;
aom_subtract_block = aom_subtract_block_sse2;
if (flags & HAS_AVX2) aom_subtract_block = aom_subtract_block_avx2;
@@ -8023,9 +7999,6 @@ static void setup_rtcd_internal(void)
aom_vector_var = aom_vector_var_c;
if (flags & HAS_SSE4_1) aom_vector_var = aom_vector_var_sse4_1;
if (flags & HAS_AVX2) aom_vector_var = aom_vector_var_avx2;
- av1_compute_cross_correlation = av1_compute_cross_correlation_c;
- if (flags & HAS_SSE4_1) av1_compute_cross_correlation = av1_compute_cross_correlation_sse4_1;
- if (flags & HAS_AVX2) av1_compute_cross_correlation = av1_compute_cross_correlation_avx2;
}
#endif
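
Every hunk in this file applies one mechanical change: the SSE2 kernels are deleted, so each dispatch pointer is now seeded with the plain-C baseline and only upgraded when runtime CPU detection reports SSSE3, SSE4.1, or AVX2. A minimal, self-contained sketch of that dispatch pattern follows; the function and flag names are hypothetical stand-ins, and only the structure mirrors the generated setup_rtcd_internal() above.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical CPU-feature bits, mirroring the HAS_* flags tested above. */
#define HAS_SSSE3 (1 << 0)
#define HAS_AVX2  (1 << 1)

/* One kernel per ISA level; _c is the portable baseline this patch now
 * falls back to when no SIMD flag is set (stubs stand in for real kernels). */
static uint32_t var_c(const uint8_t *src, int stride)     { (void)src; (void)stride; return 1; }
static uint32_t var_ssse3(const uint8_t *src, int stride) { (void)src; (void)stride; return 2; }
static uint32_t var_avx2(const uint8_t *src, int stride)  { (void)src; (void)stride; return 3; }

/* The dispatch pointer the rest of the codec calls through (RTCD_EXTERN). */
static uint32_t (*var)(const uint8_t *src, int stride);

/* Shape of setup_rtcd_internal(): seed with the baseline, then test flags in
 * increasing ISA order so the strongest kernel the CPU supports wins. */
static void setup_dispatch(int flags) {
  var = var_c;
  if (flags & HAS_SSSE3) var = var_ssse3;
  if (flags & HAS_AVX2)  var = var_avx2;
}

int main(void) {
  uint8_t px[1] = {0};
  setup_dispatch(HAS_SSSE3);              /* e.g. an SSSE3-only CPU */
  printf("%u\n", (unsigned)var(px, 1));   /* prints 2: the SSSE3 kernel */
  return 0;
}
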
diff --git a/media/libaom/config/mac/x64/config/aom_scale_rtcd.h b/media/libaom/config/mac/x64/config/aom_scale_rtcd.h
index 3b70fb47c3..cdabb21106 100644
--- a/media/libaom/config/mac/x64/config/aom_scale_rtcd.h
+++ b/media/libaom/config/mac/x64/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
#define RTCD_EXTERN extern
#endif
+#include <stdbool.h>
+
struct yv12_buffer_config;
#ifdef __cplusplus
extern "C" {
#endif
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
#define aom_extend_frame_borders aom_extend_frame_borders_c
void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
#define aom_yv12_copy_frame aom_yv12_copy_frame_c
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_u aom_yv12_copy_u_c
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_v aom_yv12_copy_v_c
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
#define aom_yv12_copy_y aom_yv12_copy_y_c
void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
void aom_scale_rtcd(void);
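
Two signature changes in this header drive the new include: aom_yv12_copy_{y,u,v}_c gain an int use_crop argument, and aom_yv12_realloc_with_new_border_c swaps int num_pyramid_levels for bool alloc_pyramid. Before C23, bool is not a C keyword, which is why the hunk adds #include <stdbool.h> at the top. A reduced illustration, with a hypothetical function name but the same shape as the prototype above:

#include <stdbool.h>   /* required for `bool` in C prototypes prior to C23 */

struct yv12_buffer_config;  /* opaque here, as in the header */

/* Hypothetical reduction of the realloc prototype above. */
int realloc_with_new_border(struct yv12_buffer_config *ybf, int new_border,
                            int byte_alignment, bool alloc_pyramid,
                            int num_planes);
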
diff --git a/media/libaom/config/mac/x64/config/av1_rtcd.h b/media/libaom/config/mac/x64/config/av1_rtcd.h
index b1cdc99700..ad72985afe 100644
--- a/media/libaom/config/mac/x64/config/av1_rtcd.h
+++ b/media/libaom/config/mac/x64/config/av1_rtcd.h
@@ -253,7 +253,6 @@ void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *ds
#define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
RTCD_EXTERN void (*av1_dist_wtd_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@@ -659,7 +658,6 @@ void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
-void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
RTCD_EXTERN void (*av1_lowbd_fwd_txfm)(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
@@ -755,85 +753,61 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
RTCD_EXTERN void (*cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
RTCD_EXTERN void (*cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_0_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_0_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_0)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_1)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_2)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_3)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_0)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_1)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_2)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_3)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
RTCD_EXTERN void (*cdef_find_dir_dual)(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
@@ -941,7 +915,7 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
av1_convolve_y_sr = av1_convolve_y_sr_sse2;
if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
- av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_sse2;
+ av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c;
if (flags & HAS_SSSE3) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3;
if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2;
av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_sse2;
@@ -1091,7 +1065,7 @@ static void setup_rtcd_internal(void)
av1_inv_txfm_add = av1_inv_txfm_add_c;
if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
- av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse2;
+ av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_c;
if (flags & HAS_SSE4_1) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse4_1;
if (flags & HAS_AVX2) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_avx2;
av1_lowbd_pixel_proj_error = av1_lowbd_pixel_proj_error_c;
@@ -1133,52 +1107,40 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_avx2;
av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
- cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse2;
- if (flags & HAS_SSSE3) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_ssse3;
+ cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c;
if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1;
if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2;
- cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse2;
- if (flags & HAS_SSSE3) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_ssse3;
+ cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c;
if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1;
if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2;
- cdef_filter_16_0 = cdef_filter_16_0_sse2;
- if (flags & HAS_SSSE3) cdef_filter_16_0 = cdef_filter_16_0_ssse3;
+ cdef_filter_16_0 = cdef_filter_16_0_c;
if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2;
- cdef_filter_16_1 = cdef_filter_16_1_sse2;
- if (flags & HAS_SSSE3) cdef_filter_16_1 = cdef_filter_16_1_ssse3;
+ cdef_filter_16_1 = cdef_filter_16_1_c;
if (flags & HAS_SSE4_1) cdef_filter_16_1 = cdef_filter_16_1_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2;
- cdef_filter_16_2 = cdef_filter_16_2_sse2;
- if (flags & HAS_SSSE3) cdef_filter_16_2 = cdef_filter_16_2_ssse3;
+ cdef_filter_16_2 = cdef_filter_16_2_c;
if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2;
- cdef_filter_16_3 = cdef_filter_16_3_sse2;
- if (flags & HAS_SSSE3) cdef_filter_16_3 = cdef_filter_16_3_ssse3;
+ cdef_filter_16_3 = cdef_filter_16_3_c;
if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2;
- cdef_filter_8_0 = cdef_filter_8_0_sse2;
- if (flags & HAS_SSSE3) cdef_filter_8_0 = cdef_filter_8_0_ssse3;
+ cdef_filter_8_0 = cdef_filter_8_0_c;
if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2;
- cdef_filter_8_1 = cdef_filter_8_1_sse2;
- if (flags & HAS_SSSE3) cdef_filter_8_1 = cdef_filter_8_1_ssse3;
+ cdef_filter_8_1 = cdef_filter_8_1_c;
if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2;
- cdef_filter_8_2 = cdef_filter_8_2_sse2;
- if (flags & HAS_SSSE3) cdef_filter_8_2 = cdef_filter_8_2_ssse3;
+ cdef_filter_8_2 = cdef_filter_8_2_c;
if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2;
- cdef_filter_8_3 = cdef_filter_8_3_sse2;
- if (flags & HAS_SSSE3) cdef_filter_8_3 = cdef_filter_8_3_ssse3;
+ cdef_filter_8_3 = cdef_filter_8_3_c;
if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2;
- cdef_find_dir = cdef_find_dir_sse2;
- if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+ cdef_find_dir = cdef_find_dir_c;
if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
- cdef_find_dir_dual = cdef_find_dir_dual_sse2;
- if (flags & HAS_SSSE3) cdef_find_dir_dual = cdef_find_dir_dual_ssse3;
+ cdef_find_dir_dual = cdef_find_dir_dual_c;
if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1;
if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2;
cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
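
The av1_rtcd.h hunks rely on the same RTCD_EXTERN convention visible at the top of the scale header: every translation unit that includes the header sees extern declarations of the dispatch pointers, while the single TU that defines RTCD_C before including it owns the definitions plus the setup function that fills them in. A hedged sketch of that split, with hypothetical names:

/* --- rtcd_sketch.h ---------------------------------------------------- */
#ifdef RTCD_C
#define RTCD_EXTERN            /* the one TU defining RTCD_C owns the pointers */
#else
#define RTCD_EXTERN extern     /* every other TU only gets declarations */
#endif

RTCD_EXTERN int (*find_dir)(const unsigned short *img, int stride);

#ifdef RTCD_C
static void setup_rtcd_sketch(void) { find_dir = 0; /* pick a kernel here */ }
#endif
/* ----------------------------------------------------------------------- */

/* codec.c:  #include "rtcd_sketch.h"          -> extern declaration        */
/* rtcd.c:   #define RTCD_C                                                 */
/*           #include "rtcd_sketch.h"          -> definition + setup        */
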
diff --git a/media/libaom/config/win/ia32/config/aom_config.asm b/media/libaom/config/win/ia32/config/aom_config.asm
index af78328283..8f6c3592fa 100644
--- a/media/libaom/config/win/ia32/config/aom_config.asm
+++ b/media/libaom/config/win/ia32/config/aom_config.asm
@@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1
CONFIG_OUTPUT_FRAME_SIZE equ 0
CONFIG_PARTITION_SEARCH_ORDER equ 0
CONFIG_PIC equ 0
+CONFIG_QUANT_MATRIX equ 1
CONFIG_RATECTRL_LOG equ 0
CONFIG_RD_COMMAND equ 0
CONFIG_RD_DEBUG equ 0
@@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 1
HAVE_SSE4_2 equ 1
HAVE_SSSE3 equ 1
HAVE_SVE equ 0
+HAVE_SVE2 equ 0
HAVE_VSX equ 0
HAVE_WXWIDGETS equ 0
STATIC_LINK_JXL equ 0
diff --git a/media/libaom/config/win/ia32/config/aom_config.h b/media/libaom/config/win/ia32/config/aom_config.h
index dba805b1b6..7d1ce61373 100644
--- a/media/libaom/config/win/ia32/config/aom_config.h
+++ b/media/libaom/config/win/ia32/config/aom_config.h
@@ -55,6 +55,7 @@
#define CONFIG_OUTPUT_FRAME_SIZE 0
#define CONFIG_PARTITION_SEARCH_ORDER 0
#define CONFIG_PIC 0
+#define CONFIG_QUANT_MATRIX 1
#define CONFIG_RATECTRL_LOG 0
#define CONFIG_RD_COMMAND 0
#define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
#define HAVE_SSE4_2 1
#define HAVE_SSSE3 1
#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_VSX 0
#define HAVE_WXWIDGETS 0
#define INLINE inline
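
As on the other platforms in this patch, each new build flag lands twice so the assembler and the C compiler see the same configuration: an equ in aom_config.asm and a matching #define in aom_config.h. The paired form, with the values from the hunks above:

/* aom_config.h side; aom_config.asm carries the matching lines
 * "CONFIG_QUANT_MATRIX equ 1" and "HAVE_SVE2 equ 0". */
#define CONFIG_QUANT_MATRIX 1
#define HAVE_SVE2 0
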
diff --git a/media/libaom/config/win/ia32/config/aom_dsp_rtcd.h b/media/libaom/config/win/ia32/config/aom_dsp_rtcd.h
index a19adf5f61..93472f0e92 100644
--- a/media/libaom/config/win/ia32/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/win/ia32/config/aom_dsp_rtcd.h
@@ -57,21 +57,30 @@ void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width
void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+RTCD_EXTERN double (*aom_compute_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+
void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+RTCD_EXTERN bool (*aom_compute_mean_stddev)(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+
void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define aom_convolve8 aom_convolve8_c
void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -903,6 +912,7 @@ RTCD_EXTERN unsigned int (*aom_highbd_10_masked_sub_pixel_variance8x8)(const uin
unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
RTCD_EXTERN unsigned int (*aom_highbd_10_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int aom_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
@@ -5130,7 +5140,8 @@ unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const
RTCD_EXTERN unsigned int (*aom_sad16x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x4x3d aom_sad16x4x3d_c
+void aom_sad16x4x3d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad16x4x3d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -5466,7 +5477,8 @@ unsigned int aom_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const u
#define aom_sad_skip_16x4 aom_sad_skip_16x4_c
void aom_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad_skip_16x4x4d aom_sad_skip_16x4x4d_c
+void aom_sad_skip_16x4x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad_skip_16x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
unsigned int aom_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -5867,243 +5879,199 @@ void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, ui
#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c
uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -6326,11 +6294,6 @@ int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl);
int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl);
RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-RTCD_EXTERN double (*av1_compute_cross_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-
void aom_dsp_rtcd(void);
#ifdef RTCD_C
@@ -6360,14 +6323,19 @@ static void setup_rtcd_internal(void)
aom_comp_mask_pred = aom_comp_mask_pred_c;
if (flags & HAS_SSSE3) aom_comp_mask_pred = aom_comp_mask_pred_ssse3;
if (flags & HAS_AVX2) aom_comp_mask_pred = aom_comp_mask_pred_avx2;
+ aom_compute_correlation = aom_compute_correlation_c;
+ if (flags & HAS_SSE4_1) aom_compute_correlation = aom_compute_correlation_sse4_1;
+ if (flags & HAS_AVX2) aom_compute_correlation = aom_compute_correlation_avx2;
aom_compute_flow_at_point = aom_compute_flow_at_point_c;
if (flags & HAS_SSE4_1) aom_compute_flow_at_point = aom_compute_flow_at_point_sse4_1;
+ if (flags & HAS_AVX2) aom_compute_flow_at_point = aom_compute_flow_at_point_avx2;
+ aom_compute_mean_stddev = aom_compute_mean_stddev_c;
+ if (flags & HAS_SSE4_1) aom_compute_mean_stddev = aom_compute_mean_stddev_sse4_1;
+ if (flags & HAS_AVX2) aom_compute_mean_stddev = aom_compute_mean_stddev_avx2;
aom_convolve8_horiz = aom_convolve8_horiz_c;
- if (flags & HAS_SSE2) aom_convolve8_horiz = aom_convolve8_horiz_sse2;
if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
aom_convolve8_vert = aom_convolve8_vert_c;
- if (flags & HAS_SSE2) aom_convolve8_vert = aom_convolve8_vert_sse2;
if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
aom_convolve_copy = aom_convolve_copy_c;
@@ -6768,6 +6736,7 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_ssse3;
aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_c;
if (flags & HAS_SSE2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_sse2;
+ if (flags & HAS_AVX2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_avx2;
aom_highbd_10_mse8x8 = aom_highbd_10_mse8x8_c;
if (flags & HAS_SSE2) aom_highbd_10_mse8x8 = aom_highbd_10_mse8x8_sse2;
aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_c;
@@ -8526,6 +8495,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) aom_sad16x4 = aom_sad16x4_sse2;
aom_sad16x4_avg = aom_sad16x4_avg_c;
if (flags & HAS_SSE2) aom_sad16x4_avg = aom_sad16x4_avg_sse2;
+ aom_sad16x4x3d = aom_sad16x4x3d_c;
+ if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2;
aom_sad16x4x4d = aom_sad16x4x4d_c;
if (flags & HAS_SSE2) aom_sad16x4x4d = aom_sad16x4x4d_sse2;
if (flags & HAS_AVX2) aom_sad16x4x4d = aom_sad16x4x4d_avx2;
@@ -8695,6 +8666,8 @@ static void setup_rtcd_internal(void)
aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_c;
if (flags & HAS_SSE2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_sse2;
if (flags & HAS_AVX2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_avx2;
+ aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_c;
+ if (flags & HAS_AVX2) aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_avx2;
aom_sad_skip_16x64 = aom_sad_skip_16x64_c;
if (flags & HAS_SSE2) aom_sad_skip_16x64 = aom_sad_skip_16x64_sse2;
aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_c;
@@ -8897,157 +8870,113 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE4_1) aom_sse = aom_sse_sse4_1;
if (flags & HAS_AVX2) aom_sse = aom_sse_avx2;
aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_avx2;
aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_avx2;
aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_ssse3;
aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_ssse3;
aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_ssse3;
aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_ssse3;
aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_ssse3;
aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_avx2;
aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_avx2;
aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_avx2;
aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_ssse3;
aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_ssse3;
aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3;
aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_ssse3;
aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_avx2;
aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_ssse3;
aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_avx2;
aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_avx2;
aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_ssse3;
aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_ssse3;
aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_ssse3;
aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_ssse3;
aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_avx2;
aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_avx2;
aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2;
aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_avx2;
aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_avx2;
aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_avx2;
aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_avx2;
aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_avx2;
aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_avx2;
aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_avx2;
aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_ssse3;
aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_ssse3;
aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_ssse3;
aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_ssse3;
aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_avx2;
aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_ssse3;
aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_avx2;
aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_avx2;
aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_ssse3;
aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_ssse3;
aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_ssse3;
aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_c;
- if (flags & HAS_SSE2) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_sse2;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_ssse3;
aom_subtract_block = aom_subtract_block_c;
if (flags & HAS_SSE2) aom_subtract_block = aom_subtract_block_sse2;
@@ -9172,9 +9101,6 @@ static void setup_rtcd_internal(void)
aom_vector_var = aom_vector_var_c;
if (flags & HAS_SSE4_1) aom_vector_var = aom_vector_var_sse4_1;
if (flags & HAS_AVX2) aom_vector_var = aom_vector_var_avx2;
- av1_compute_cross_correlation = av1_compute_cross_correlation_c;
- if (flags & HAS_SSE4_1) av1_compute_cross_correlation = av1_compute_cross_correlation_sse4_1;
- if (flags & HAS_AVX2) av1_compute_cross_correlation = av1_compute_cross_correlation_avx2;
}
#endif
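For context on the hunks above: the RTCD_EXTERN pointers declared throughout
this header are bound once at startup by setup_rtcd_internal(), which probes
the CPU and assigns the strongest available kernel on top of the plain C
fallback. A minimal sketch of that dispatch pattern follows; the flag values,
probe function and kernel bodies are illustrative placeholders, not libaom's
actual definitions.

#include <stdio.h>

/* Sketch of run-time CPU dispatch (RTCD); names and flag values here are
 * stand-ins, not libaom's actual definitions. */
#define HAS_SSSE3 0x1
#define HAS_AVX2  0x2

static int my_kernel_c(int x)     { return x + 1; } /* portable fallback */
static int my_kernel_ssse3(int x) { return x + 1; } /* real SIMD bodies differ */
static int my_kernel_avx2(int x)  { return x + 1; }

int (*my_kernel)(int); /* plays the role of an RTCD_EXTERN pointer */

static int probe_cpu_flags(void) { return HAS_SSSE3; /* stub: pretend no AVX2 */ }

void setup_my_rtcd(void) {
  int flags = probe_cpu_flags();
  my_kernel = my_kernel_c;                           /* always start at C */
  if (flags & HAS_SSSE3) my_kernel = my_kernel_ssse3;
  if (flags & HAS_AVX2)  my_kernel = my_kernel_avx2; /* strongest wins last */
}

int main(void) {
  setup_my_rtcd();
  printf("%d\n", my_kernel(41)); /* 42, via the ssse3 stand-in */
  return 0;
}

The assignment order is why the removals above are safe: dropping the
HAS_SSE2 lines simply leaves SSSE3 as the first specialization tried after
the C fallback, with AVX2 still overriding it when available.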
diff --git a/media/libaom/config/win/ia32/config/aom_scale_rtcd.h b/media/libaom/config/win/ia32/config/aom_scale_rtcd.h
index 3b70fb47c3..cdabb21106 100644
--- a/media/libaom/config/win/ia32/config/aom_scale_rtcd.h
+++ b/media/libaom/config/win/ia32/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
#define RTCD_EXTERN extern
#endif
+#include <stdbool.h>
+
struct yv12_buffer_config;
#ifdef __cplusplus
extern "C" {
#endif
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
#define aom_extend_frame_borders aom_extend_frame_borders_c
void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
#define aom_yv12_copy_frame aom_yv12_copy_frame_c
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_u aom_yv12_copy_u_c
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_v aom_yv12_copy_v_c
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
#define aom_yv12_copy_y aom_yv12_copy_y_c
void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
void aom_scale_rtcd(void);
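The hunk above also changes caller-visible signatures: aom_yv12_copy_y_c,
aom_yv12_copy_u_c and aom_yv12_copy_v_c gain an int use_crop argument, and
aom_yv12_realloc_with_new_border_c replaces int num_pyramid_levels with
bool alloc_pyramid, which is what forces the new #include <stdbool.h> at the
top of the header. A hedged caller sketch, assuming only the declarations
exactly as patched; the argument values are assumptions for illustration,
since the diff does not show their semantics:

#include <stdbool.h>
struct yv12_buffer_config; /* opaque here, as in the header */

/* Prototypes as patched above. */
void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc,
                       struct yv12_buffer_config *dst_ybc, int use_crop);
int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf,
                                       int new_border, int byte_alignment,
                                       bool alloc_pyramid, int num_planes);

void example(const struct yv12_buffer_config *src,
             struct yv12_buffer_config *dst) {
  /* use_crop = 0 is an assumed value, for illustration only. */
  aom_yv12_copy_y_c(src, dst, 0);
  /* alloc_pyramid = false likewise assumed; previously an int level count. */
  (void)aom_yv12_realloc_with_new_border_c(dst, 32, 16, false, 3);
}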
diff --git a/media/libaom/config/win/ia32/config/av1_rtcd.h b/media/libaom/config/win/ia32/config/av1_rtcd.h
index 3f404f61c8..37716517bf 100644
--- a/media/libaom/config/win/ia32/config/av1_rtcd.h
+++ b/media/libaom/config/win/ia32/config/av1_rtcd.h
@@ -265,7 +265,6 @@ void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *ds
#define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
RTCD_EXTERN void (*av1_dist_wtd_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@@ -764,84 +763,72 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
RTCD_EXTERN void (*cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
RTCD_EXTERN void (*cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_0_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_0_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_0_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_0)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_1)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_2)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_3)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_0)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_1)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_2)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_3)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
@@ -969,7 +956,6 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) av1_convolve_y_sr = av1_convolve_y_sr_sse2;
if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c;
- if (flags & HAS_SSE2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_sse2;
if (flags & HAS_SSSE3) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3;
if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2;
av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_c;
@@ -1176,62 +1162,50 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSE2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c;
- if (flags & HAS_SSE2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse2;
if (flags & HAS_SSSE3) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_ssse3;
if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1;
if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2;
cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c;
- if (flags & HAS_SSE2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse2;
if (flags & HAS_SSSE3) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_ssse3;
if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1;
if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2;
cdef_filter_16_0 = cdef_filter_16_0_c;
- if (flags & HAS_SSE2) cdef_filter_16_0 = cdef_filter_16_0_sse2;
if (flags & HAS_SSSE3) cdef_filter_16_0 = cdef_filter_16_0_ssse3;
if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2;
cdef_filter_16_1 = cdef_filter_16_1_c;
- if (flags & HAS_SSE2) cdef_filter_16_1 = cdef_filter_16_1_sse2;
if (flags & HAS_SSSE3) cdef_filter_16_1 = cdef_filter_16_1_ssse3;
if (flags & HAS_SSE4_1) cdef_filter_16_1 = cdef_filter_16_1_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2;
cdef_filter_16_2 = cdef_filter_16_2_c;
- if (flags & HAS_SSE2) cdef_filter_16_2 = cdef_filter_16_2_sse2;
if (flags & HAS_SSSE3) cdef_filter_16_2 = cdef_filter_16_2_ssse3;
if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2;
cdef_filter_16_3 = cdef_filter_16_3_c;
- if (flags & HAS_SSE2) cdef_filter_16_3 = cdef_filter_16_3_sse2;
if (flags & HAS_SSSE3) cdef_filter_16_3 = cdef_filter_16_3_ssse3;
if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2;
cdef_filter_8_0 = cdef_filter_8_0_c;
- if (flags & HAS_SSE2) cdef_filter_8_0 = cdef_filter_8_0_sse2;
if (flags & HAS_SSSE3) cdef_filter_8_0 = cdef_filter_8_0_ssse3;
if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2;
cdef_filter_8_1 = cdef_filter_8_1_c;
- if (flags & HAS_SSE2) cdef_filter_8_1 = cdef_filter_8_1_sse2;
if (flags & HAS_SSSE3) cdef_filter_8_1 = cdef_filter_8_1_ssse3;
if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2;
cdef_filter_8_2 = cdef_filter_8_2_c;
- if (flags & HAS_SSE2) cdef_filter_8_2 = cdef_filter_8_2_sse2;
if (flags & HAS_SSSE3) cdef_filter_8_2 = cdef_filter_8_2_ssse3;
if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2;
cdef_filter_8_3 = cdef_filter_8_3_c;
- if (flags & HAS_SSE2) cdef_filter_8_3 = cdef_filter_8_3_sse2;
if (flags & HAS_SSSE3) cdef_filter_8_3 = cdef_filter_8_3_ssse3;
if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2;
cdef_find_dir = cdef_find_dir_c;
- if (flags & HAS_SSE2) cdef_find_dir = cdef_find_dir_sse2;
if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
cdef_find_dir_dual = cdef_find_dir_dual_c;
- if (flags & HAS_SSE2) cdef_find_dir_dual = cdef_find_dir_dual_sse2;
if (flags & HAS_SSSE3) cdef_find_dir_dual = cdef_find_dir_dual_ssse3;
if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1;
if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2;
diff --git a/media/libaom/config/win/x64/config/aom_config.asm b/media/libaom/config/win/x64/config/aom_config.asm
index f793ff3c6d..3f470f3a5f 100644
--- a/media/libaom/config/win/x64/config/aom_config.asm
+++ b/media/libaom/config/win/x64/config/aom_config.asm
@@ -53,6 +53,7 @@ CONFIG_OS_SUPPORT equ 1
CONFIG_OUTPUT_FRAME_SIZE equ 0
CONFIG_PARTITION_SEARCH_ORDER equ 0
CONFIG_PIC equ 0
+CONFIG_QUANT_MATRIX equ 1
CONFIG_RATECTRL_LOG equ 0
CONFIG_RD_COMMAND equ 0
CONFIG_RD_DEBUG equ 0
@@ -87,6 +88,7 @@ HAVE_SSE4_1 equ 1
HAVE_SSE4_2 equ 1
HAVE_SSSE3 equ 1
HAVE_SVE equ 0
+HAVE_SVE2 equ 0
HAVE_VSX equ 0
HAVE_WXWIDGETS equ 0
STATIC_LINK_JXL equ 0
diff --git a/media/libaom/config/win/x64/config/aom_config.h b/media/libaom/config/win/x64/config/aom_config.h
index 670d2ffe56..6d96b65b07 100644
--- a/media/libaom/config/win/x64/config/aom_config.h
+++ b/media/libaom/config/win/x64/config/aom_config.h
@@ -55,6 +55,7 @@
#define CONFIG_OUTPUT_FRAME_SIZE 0
#define CONFIG_PARTITION_SEARCH_ORDER 0
#define CONFIG_PIC 0
+#define CONFIG_QUANT_MATRIX 1
#define CONFIG_RATECTRL_LOG 0
#define CONFIG_RD_COMMAND 0
#define CONFIG_RD_DEBUG 0
@@ -89,6 +90,7 @@
#define HAVE_SSE4_2 1
#define HAVE_SSSE3 1
#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_VSX 0
#define HAVE_WXWIDGETS 0
#define INLINE inline
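
CONFIG_QUANT_MATRIX and HAVE_SVE2 are added to both views of the build configuration: the NASM side (aom_config.asm, `equ`) and the C side (aom_config.h, `#define`). The two files must stay in lockstep because assembly and C sources are built against the same feature set. A small sketch of how the C-side macros gate compilation; the function body is hypothetical, only the macro values mirror the win/x64 config above:

#include <stdio.h>

/* Mirrors media/libaom/config/win/x64/config/aom_config.h */
#define CONFIG_QUANT_MATRIX 1
#define HAVE_SVE2 0

int main(void)
{
#if CONFIG_QUANT_MATRIX
    puts("quantization-matrix support compiled in");
#endif
#if HAVE_SVE2
    puts("SVE2 kernels available");
#else
    puts("SVE2 kernels not built"); /* expected on win/x64 */
#endif
    return 0;
}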
diff --git a/media/libaom/config/win/x64/config/aom_dsp_rtcd.h b/media/libaom/config/win/x64/config/aom_dsp_rtcd.h
index 8e979cc189..9135c6f423 100644
--- a/media/libaom/config/win/x64/config/aom_dsp_rtcd.h
+++ b/media/libaom/config/win/x64/config/aom_dsp_rtcd.h
@@ -57,21 +57,30 @@ void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width
void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
RTCD_EXTERN void (*aom_comp_mask_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
+double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+RTCD_EXTERN double (*aom_compute_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2);
+
void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
RTCD_EXTERN void (*aom_compute_flow_at_point)(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v);
+bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+RTCD_EXTERN bool (*aom_compute_mean_stddev)(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev);
+
void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define aom_convolve8 aom_convolve8_c
void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -903,7 +912,8 @@ RTCD_EXTERN unsigned int (*aom_highbd_10_masked_sub_pixel_variance8x8)(const uin
unsigned int aom_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define aom_highbd_10_mse16x16 aom_highbd_10_mse16x16_sse2
+unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*aom_highbd_10_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
unsigned int aom_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define aom_highbd_10_mse16x8 aom_highbd_10_mse16x8_c
@@ -5132,7 +5142,8 @@ unsigned int aom_sad16x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const
#define aom_sad16x4_avg aom_sad16x4_avg_sse2
void aom_sad16x4x3d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad16x4x3d aom_sad16x4x3d_c
+void aom_sad16x4x3d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad16x4x3d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
void aom_sad16x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
@@ -5468,7 +5479,8 @@ unsigned int aom_sad_skip_16x4_c(const uint8_t *src_ptr, int src_stride, const u
#define aom_sad_skip_16x4 aom_sad_skip_16x4_c
void aom_sad_skip_16x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
-#define aom_sad_skip_16x4x4d aom_sad_skip_16x4x4d_c
+void aom_sad_skip_16x4x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*aom_sad_skip_16x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]);
unsigned int aom_sad_skip_16x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int aom_sad_skip_16x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -5870,243 +5882,199 @@ void aom_ssim_parms_8x8_sse2(const uint8_t *s, int sp, const uint8_t *r, int rp,
#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_sse2
uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t aom_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
RTCD_EXTERN uint32_t (*aom_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
uint32_t aom_sub_pixel_variance128x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance128x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance128x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance128x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x4_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance16x8_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance32x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance32x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance32x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x128_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x128_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x128)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t aom_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
uint32_t aom_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
RTCD_EXTERN uint32_t (*aom_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -6329,11 +6297,6 @@ int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl);
int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl);
RTCD_EXTERN int (*aom_vector_var)(const int16_t *ref, const int16_t *src, int bwl);
-double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-double av1_compute_cross_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-RTCD_EXTERN double (*av1_compute_cross_correlation)(const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2);
-
void aom_dsp_rtcd(void);
#ifdef RTCD_C
@@ -6358,12 +6321,19 @@ static void setup_rtcd_internal(void)
aom_comp_mask_pred = aom_comp_mask_pred_c;
if (flags & HAS_SSSE3) aom_comp_mask_pred = aom_comp_mask_pred_ssse3;
if (flags & HAS_AVX2) aom_comp_mask_pred = aom_comp_mask_pred_avx2;
+ aom_compute_correlation = aom_compute_correlation_c;
+ if (flags & HAS_SSE4_1) aom_compute_correlation = aom_compute_correlation_sse4_1;
+ if (flags & HAS_AVX2) aom_compute_correlation = aom_compute_correlation_avx2;
aom_compute_flow_at_point = aom_compute_flow_at_point_c;
if (flags & HAS_SSE4_1) aom_compute_flow_at_point = aom_compute_flow_at_point_sse4_1;
- aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+ if (flags & HAS_AVX2) aom_compute_flow_at_point = aom_compute_flow_at_point_avx2;
+ aom_compute_mean_stddev = aom_compute_mean_stddev_c;
+ if (flags & HAS_SSE4_1) aom_compute_mean_stddev = aom_compute_mean_stddev_sse4_1;
+ if (flags & HAS_AVX2) aom_compute_mean_stddev = aom_compute_mean_stddev_avx2;
+ aom_convolve8_horiz = aom_convolve8_horiz_c;
if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
- aom_convolve8_vert = aom_convolve8_vert_sse2;
+ aom_convolve8_vert = aom_convolve8_vert_c;
if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
aom_convolve_copy = aom_convolve_copy_sse2;
@@ -6528,6 +6498,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x4 = aom_highbd_10_masked_sub_pixel_variance8x4_ssse3;
aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_c;
if (flags & HAS_SSSE3) aom_highbd_10_masked_sub_pixel_variance8x8 = aom_highbd_10_masked_sub_pixel_variance8x8_ssse3;
+ aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_sse2;
+ if (flags & HAS_AVX2) aom_highbd_10_mse16x16 = aom_highbd_10_mse16x16_avx2;
aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_c;
if (flags & HAS_SSE4_1) aom_highbd_10_obmc_variance128x128 = aom_highbd_10_obmc_variance128x128_sse4_1;
aom_highbd_10_obmc_variance128x64 = aom_highbd_10_obmc_variance128x64_c;
@@ -7626,6 +7598,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) aom_sad16x32x3d = aom_sad16x32x3d_avx2;
aom_sad16x32x4d = aom_sad16x32x4d_sse2;
if (flags & HAS_AVX2) aom_sad16x32x4d = aom_sad16x32x4d_avx2;
+ aom_sad16x4x3d = aom_sad16x4x3d_c;
+ if (flags & HAS_AVX2) aom_sad16x4x3d = aom_sad16x4x3d_avx2;
aom_sad16x4x4d = aom_sad16x4x4d_sse2;
if (flags & HAS_AVX2) aom_sad16x4x4d = aom_sad16x4x4d_avx2;
aom_sad16x64x3d = aom_sad16x64x3d_c;
@@ -7704,6 +7678,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) aom_sad_skip_16x16x4d = aom_sad_skip_16x16x4d_avx2;
aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_sse2;
if (flags & HAS_AVX2) aom_sad_skip_16x32x4d = aom_sad_skip_16x32x4d_avx2;
+ aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_c;
+ if (flags & HAS_AVX2) aom_sad_skip_16x4x4d = aom_sad_skip_16x4x4d_avx2;
aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_sse2;
if (flags & HAS_AVX2) aom_sad_skip_16x64x4d = aom_sad_skip_16x64x4d_avx2;
aom_sad_skip_16x8x4d = aom_sad_skip_16x8x4d_sse2;
@@ -7859,114 +7835,114 @@ static void setup_rtcd_internal(void)
aom_sse = aom_sse_c;
if (flags & HAS_SSE4_1) aom_sse = aom_sse_sse4_1;
if (flags & HAS_AVX2) aom_sse = aom_sse_avx2;
- aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_sse2;
+ aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x128 = aom_sub_pixel_avg_variance128x128_avx2;
- aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_sse2;
+ aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance128x64 = aom_sub_pixel_avg_variance128x64_avx2;
- aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_sse2;
+ aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x16 = aom_sub_pixel_avg_variance16x16_ssse3;
- aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_sse2;
+ aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x32 = aom_sub_pixel_avg_variance16x32_ssse3;
- aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_sse2;
+ aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x4 = aom_sub_pixel_avg_variance16x4_ssse3;
- aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_sse2;
+ aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x64 = aom_sub_pixel_avg_variance16x64_ssse3;
- aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_sse2;
+ aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance16x8 = aom_sub_pixel_avg_variance16x8_ssse3;
- aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_sse2;
+ aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x16 = aom_sub_pixel_avg_variance32x16_avx2;
- aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_sse2;
+ aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x32 = aom_sub_pixel_avg_variance32x32_avx2;
- aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_sse2;
+ aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance32x64 = aom_sub_pixel_avg_variance32x64_avx2;
- aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_sse2;
+ aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance32x8 = aom_sub_pixel_avg_variance32x8_ssse3;
- aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_sse2;
+ aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x16 = aom_sub_pixel_avg_variance4x16_ssse3;
- aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_sse2;
+ aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x4 = aom_sub_pixel_avg_variance4x4_ssse3;
- aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_sse2;
+ aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance4x8 = aom_sub_pixel_avg_variance4x8_ssse3;
- aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_sse2;
+ aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x128 = aom_sub_pixel_avg_variance64x128_avx2;
- aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_sse2;
+ aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x16 = aom_sub_pixel_avg_variance64x16_ssse3;
- aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_sse2;
+ aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x32 = aom_sub_pixel_avg_variance64x32_avx2;
- aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_sse2;
+ aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_avg_variance64x64 = aom_sub_pixel_avg_variance64x64_avx2;
- aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_sse2;
+ aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x16 = aom_sub_pixel_avg_variance8x16_ssse3;
- aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_sse2;
+ aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x32 = aom_sub_pixel_avg_variance8x32_ssse3;
- aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_sse2;
+ aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x4 = aom_sub_pixel_avg_variance8x4_ssse3;
- aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_sse2;
+ aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_avg_variance8x8 = aom_sub_pixel_avg_variance8x8_ssse3;
- aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_sse2;
+ aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance128x128 = aom_sub_pixel_variance128x128_avx2;
- aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_sse2;
+ aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance128x64 = aom_sub_pixel_variance128x64_avx2;
- aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_sse2;
+ aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x16 = aom_sub_pixel_variance16x16_avx2;
- aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_sse2;
+ aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x32 = aom_sub_pixel_variance16x32_avx2;
- aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_sse2;
+ aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x4 = aom_sub_pixel_variance16x4_avx2;
- aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_sse2;
+ aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x64 = aom_sub_pixel_variance16x64_avx2;
- aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_sse2;
+ aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance16x8 = aom_sub_pixel_variance16x8_avx2;
- aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_sse2;
+ aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x16 = aom_sub_pixel_variance32x16_avx2;
- aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_sse2;
+ aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x32 = aom_sub_pixel_variance32x32_avx2;
- aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_sse2;
+ aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance32x64 = aom_sub_pixel_variance32x64_avx2;
- aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_sse2;
+ aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance32x8 = aom_sub_pixel_variance32x8_ssse3;
- aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_sse2;
+ aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x16 = aom_sub_pixel_variance4x16_ssse3;
- aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_sse2;
+ aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x4 = aom_sub_pixel_variance4x4_ssse3;
- aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_sse2;
+ aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance4x8 = aom_sub_pixel_variance4x8_ssse3;
- aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_sse2;
+ aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x128 = aom_sub_pixel_variance64x128_avx2;
- aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_sse2;
+ aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x16 = aom_sub_pixel_variance64x16_ssse3;
- aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_sse2;
+ aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x32 = aom_sub_pixel_variance64x32_avx2;
- aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_sse2;
+ aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_ssse3;
if (flags & HAS_AVX2) aom_sub_pixel_variance64x64 = aom_sub_pixel_variance64x64_avx2;
- aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_sse2;
+ aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x16 = aom_sub_pixel_variance8x16_ssse3;
- aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_sse2;
+ aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x32 = aom_sub_pixel_variance8x32_ssse3;
- aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_sse2;
+ aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x4 = aom_sub_pixel_variance8x4_ssse3;
- aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_sse2;
+ aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_c;
if (flags & HAS_SSSE3) aom_sub_pixel_variance8x8 = aom_sub_pixel_variance8x8_ssse3;
aom_subtract_block = aom_subtract_block_sse2;
if (flags & HAS_AVX2) aom_subtract_block = aom_subtract_block_avx2;
@@ -8023,9 +7999,6 @@ static void setup_rtcd_internal(void)
aom_vector_var = aom_vector_var_c;
if (flags & HAS_SSE4_1) aom_vector_var = aom_vector_var_sse4_1;
if (flags & HAS_AVX2) aom_vector_var = aom_vector_var_avx2;
- av1_compute_cross_correlation = av1_compute_cross_correlation_c;
- if (flags & HAS_SSE4_1) av1_compute_cross_correlation = av1_compute_cross_correlation_sse4_1;
- if (flags & HAS_AVX2) av1_compute_cross_correlation = av1_compute_cross_correlation_avx2;
}
#endif
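The hunks above all follow the same run-time CPU detection (RTCD) pattern: each kernel is declared once per ISA level, RTCD_EXTERN declares a function pointer, and setup_rtcd_internal() first assigns the portable _c fallback and then upgrades the pointer for each feature flag the host CPU reports. With the SSE2 kernels deleted upstream, the unconditional baseline assignment has to drop from the _sse2 variant to the _c one, which is exactly what the paired `- ..._sse2` / `+ ..._c` lines do. A minimal, self-contained sketch of the scheme follows; the HAS_* bit values, the sum_* names, and the flag passed in main are illustrative stand-ins, not aom's actual API:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical CPU-feature bits, standing in for aom's HAS_* flags. */
    #define HAS_SSSE3 (1 << 0)
    #define HAS_AVX2  (1 << 1)

    typedef uint32_t (*sum_fn)(const uint8_t *buf, int n);

    /* Portable reference implementation: always available. */
    static uint32_t sum_c(const uint8_t *buf, int n) {
      uint32_t s = 0;
      for (int i = 0; i < n; i++) s += buf[i];
      return s;
    }

    /* Stand-ins for the SIMD variants; in aom these would be real
     * SSSE3/AVX2 kernels that are only safe on matching CPUs. */
    static uint32_t sum_ssse3(const uint8_t *buf, int n) { return sum_c(buf, n); }
    static uint32_t sum_avx2(const uint8_t *buf, int n)  { return sum_c(buf, n); }

    /* The dispatch pointer, analogous to the RTCD_EXTERN pointers above. */
    static sum_fn sum_dispatch;

    /* Mirrors setup_rtcd_internal(): start at the C fallback, then
     * upgrade once per feature flag, best variant assigned last. */
    static void setup_dispatch(int flags) {
      sum_dispatch = sum_c;
      if (flags & HAS_SSSE3) sum_dispatch = sum_ssse3;
      if (flags & HAS_AVX2)  sum_dispatch = sum_avx2;
    }

    int main(void) {
      const uint8_t buf[4] = {1, 2, 3, 4};
      setup_dispatch(HAS_SSSE3);             /* pretend SSSE3 only */
      printf("%u\n", sum_dispatch(buf, 4));  /* prints 10 */
      return 0;
    }

Calling through the pointer after setup gives the fastest implementation the host supports, while the C version keeps every flag combination correct; ordering the assignments from weakest to strongest ISA makes the last matching flag win.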
diff --git a/media/libaom/config/win/x64/config/aom_scale_rtcd.h b/media/libaom/config/win/x64/config/aom_scale_rtcd.h
index 3b70fb47c3..cdabb21106 100644
--- a/media/libaom/config/win/x64/config/aom_scale_rtcd.h
+++ b/media/libaom/config/win/x64/config/aom_scale_rtcd.h
@@ -8,13 +8,15 @@
#define RTCD_EXTERN extern
#endif
+#include <stdbool.h>
+
struct yv12_buffer_config;
#ifdef __cplusplus
extern "C" {
#endif
-void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, int num_planes);
#define aom_extend_frame_borders aom_extend_frame_borders_c
void aom_extend_frame_borders_plane_row_c(const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end);
@@ -50,13 +52,13 @@ void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigne
void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
#define aom_yv12_copy_frame aom_yv12_copy_frame_c
-void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_u aom_yv12_copy_u_c
-void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop);
#define aom_yv12_copy_v aom_yv12_copy_v_c
-void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop);
#define aom_yv12_copy_y aom_yv12_copy_y_c
void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
@@ -80,7 +82,7 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
-int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes);
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes);
#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
void aom_scale_rtcd(void);
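Not every entry in these headers gets a dispatch pointer: when only the C implementation exists, the generator emits a plain #define alias, so calls compile to direct calls with no indirection. That is why the aom_yv12_copy_{y,u,v} hunks above only touch the _c prototype and leave the #define in place, and why adding the bool alloc_pyramid parameter to aom_yv12_realloc_with_new_border_c is what pulls in the new #include <stdbool.h>. A minimal sketch of the aliasing, with copy_plane and its use_crop parameter as hypothetical stand-ins:

    #include <stdio.h>

    /* Single (C-only) implementation: the header aliases it with a macro,
     * so callers pay no function-pointer indirection. use_crop mirrors
     * the new parameter added to aom_yv12_copy_{y,u,v}_c above. */
    static void copy_plane_c(const unsigned char *src, unsigned char *dst,
                             int n, int use_crop) {
      (void)use_crop;  /* in aom this would select cropped vs. padded extent */
      for (int i = 0; i < n; i++) dst[i] = src[i];
    }
    #define copy_plane copy_plane_c

    int main(void) {
      const unsigned char src[3] = {7, 8, 9};
      unsigned char dst[3] = {0};
      copy_plane(src, dst, 3, /*use_crop=*/1);
      printf("%d\n", dst[2]);  /* prints 9 */
      return 0;
    }

Because the alias is only a macro, promoting such a function to runtime dispatch later just means replacing the #define with an RTCD_EXTERN pointer; call sites do not change.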
diff --git a/media/libaom/config/win/x64/config/av1_rtcd.h b/media/libaom/config/win/x64/config/av1_rtcd.h
index b1cdc99700..ad72985afe 100644
--- a/media/libaom/config/win/x64/config/av1_rtcd.h
+++ b/media/libaom/config/win/x64/config/av1_rtcd.h
@@ -253,7 +253,6 @@ void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *ds
#define av1_convolve_y_sr_intrabc av1_convolve_y_sr_intrabc_c
void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
-void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
RTCD_EXTERN void (*av1_dist_wtd_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params);
@@ -659,7 +658,6 @@ void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
-void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
RTCD_EXTERN void (*av1_lowbd_fwd_txfm)(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param);
@@ -755,85 +753,61 @@ void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params);
void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
RTCD_EXTERN void (*cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
-void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
RTCD_EXTERN void (*cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height);
void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_0_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_0_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_0_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_0)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_1_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_1_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_1)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_2_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_2_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_2)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_sse2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_16_3_ssse3(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_sse4_1(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_16_3_avx2(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_16_3)(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_0_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_0_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_0)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_1_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_1_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_1)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_2_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_2_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_2)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_sse2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
-void cdef_filter_8_3_ssse3(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_sse4_1(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
void cdef_filter_8_3_avx2(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
RTCD_EXTERN void (*cdef_filter_8_3)(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height);
int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
-int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
-void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
RTCD_EXTERN void (*cdef_find_dir_dual)(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2);
@@ -941,7 +915,7 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
av1_convolve_y_sr = av1_convolve_y_sr_sse2;
if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
- av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_sse2;
+ av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_c;
if (flags & HAS_SSSE3) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_ssse3;
if (flags & HAS_AVX2) av1_dist_wtd_convolve_2d = av1_dist_wtd_convolve_2d_avx2;
av1_dist_wtd_convolve_2d_copy = av1_dist_wtd_convolve_2d_copy_sse2;
@@ -1091,7 +1065,7 @@ static void setup_rtcd_internal(void)
av1_inv_txfm_add = av1_inv_txfm_add_c;
if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
- av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse2;
+ av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_c;
if (flags & HAS_SSE4_1) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_sse4_1;
if (flags & HAS_AVX2) av1_lowbd_fwd_txfm = av1_lowbd_fwd_txfm_avx2;
av1_lowbd_pixel_proj_error = av1_lowbd_pixel_proj_error_c;
@@ -1133,52 +1107,40 @@ static void setup_rtcd_internal(void)
if (flags & HAS_AVX2) av1_wedge_sse_from_residuals = av1_wedge_sse_from_residuals_avx2;
av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
- cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse2;
- if (flags & HAS_SSSE3) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_ssse3;
+ cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_c;
if (flags & HAS_SSE4_1) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_sse4_1;
if (flags & HAS_AVX2) cdef_copy_rect8_16bit_to_16bit = cdef_copy_rect8_16bit_to_16bit_avx2;
- cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse2;
- if (flags & HAS_SSSE3) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_ssse3;
+ cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_c;
if (flags & HAS_SSE4_1) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_sse4_1;
if (flags & HAS_AVX2) cdef_copy_rect8_8bit_to_16bit = cdef_copy_rect8_8bit_to_16bit_avx2;
- cdef_filter_16_0 = cdef_filter_16_0_sse2;
- if (flags & HAS_SSSE3) cdef_filter_16_0 = cdef_filter_16_0_ssse3;
+ cdef_filter_16_0 = cdef_filter_16_0_c;
if (flags & HAS_SSE4_1) cdef_filter_16_0 = cdef_filter_16_0_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_0 = cdef_filter_16_0_avx2;
- cdef_filter_16_1 = cdef_filter_16_1_sse2;
- if (flags & HAS_SSSE3) cdef_filter_16_1 = cdef_filter_16_1_ssse3;
+ cdef_filter_16_1 = cdef_filter_16_1_c;
if (flags & HAS_SSE4_1) cdef_filter_16_1 = cdef_filter_16_1_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_1 = cdef_filter_16_1_avx2;
- cdef_filter_16_2 = cdef_filter_16_2_sse2;
- if (flags & HAS_SSSE3) cdef_filter_16_2 = cdef_filter_16_2_ssse3;
+ cdef_filter_16_2 = cdef_filter_16_2_c;
if (flags & HAS_SSE4_1) cdef_filter_16_2 = cdef_filter_16_2_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_2 = cdef_filter_16_2_avx2;
- cdef_filter_16_3 = cdef_filter_16_3_sse2;
- if (flags & HAS_SSSE3) cdef_filter_16_3 = cdef_filter_16_3_ssse3;
+ cdef_filter_16_3 = cdef_filter_16_3_c;
if (flags & HAS_SSE4_1) cdef_filter_16_3 = cdef_filter_16_3_sse4_1;
if (flags & HAS_AVX2) cdef_filter_16_3 = cdef_filter_16_3_avx2;
- cdef_filter_8_0 = cdef_filter_8_0_sse2;
- if (flags & HAS_SSSE3) cdef_filter_8_0 = cdef_filter_8_0_ssse3;
+ cdef_filter_8_0 = cdef_filter_8_0_c;
if (flags & HAS_SSE4_1) cdef_filter_8_0 = cdef_filter_8_0_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_0 = cdef_filter_8_0_avx2;
- cdef_filter_8_1 = cdef_filter_8_1_sse2;
- if (flags & HAS_SSSE3) cdef_filter_8_1 = cdef_filter_8_1_ssse3;
+ cdef_filter_8_1 = cdef_filter_8_1_c;
if (flags & HAS_SSE4_1) cdef_filter_8_1 = cdef_filter_8_1_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_1 = cdef_filter_8_1_avx2;
- cdef_filter_8_2 = cdef_filter_8_2_sse2;
- if (flags & HAS_SSSE3) cdef_filter_8_2 = cdef_filter_8_2_ssse3;
+ cdef_filter_8_2 = cdef_filter_8_2_c;
if (flags & HAS_SSE4_1) cdef_filter_8_2 = cdef_filter_8_2_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_2 = cdef_filter_8_2_avx2;
- cdef_filter_8_3 = cdef_filter_8_3_sse2;
- if (flags & HAS_SSSE3) cdef_filter_8_3 = cdef_filter_8_3_ssse3;
+ cdef_filter_8_3 = cdef_filter_8_3_c;
if (flags & HAS_SSE4_1) cdef_filter_8_3 = cdef_filter_8_3_sse4_1;
if (flags & HAS_AVX2) cdef_filter_8_3 = cdef_filter_8_3_avx2;
- cdef_find_dir = cdef_find_dir_sse2;
- if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+ cdef_find_dir = cdef_find_dir_c;
if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
- cdef_find_dir_dual = cdef_find_dir_dual_sse2;
- if (flags & HAS_SSSE3) cdef_find_dir_dual = cdef_find_dir_dual_ssse3;
+ cdef_find_dir_dual = cdef_find_dir_dual_c;
if (flags & HAS_SSE4_1) cdef_find_dir_dual = cdef_find_dir_dual_sse4_1;
if (flags & HAS_AVX2) cdef_find_dir_dual = cdef_find_dir_dual_avx2;
cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
diff --git a/media/libaom/moz.yaml b/media/libaom/moz.yaml
index b06ee5115a..a37ab1e904 100644
--- a/media/libaom/moz.yaml
+++ b/media/libaom/moz.yaml
@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
- release: 11631186b36e96afce18808ebebb17cc23a010ef (Fri Jan 19 23:29:34 2024 +0000).
+ release: 879d14159441796c92f3bbba7f8965e1bcf320ca (Tue Apr 02 21:57:54 2024 +0000).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
- revision: 11631186b36e96afce18808ebebb17cc23a010ef
+ revision: 879d14159441796c92f3bbba7f8965e1bcf320ca
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/
@@ -54,3 +54,7 @@ vendoring:
- action: run-script
script: '{yaml_dir}/generate_sources_mozbuild.sh'
cwd: '{yaml_dir}'
+
+ patches:
+ - 0001-errno.patch
+ - 0002-mmloadusi64.patch
diff --git a/media/libaom/sources.mozbuild b/media/libaom/sources.mozbuild
index b29ddd5c97..187bf97f8a 100644
--- a/media/libaom/sources.mozbuild
+++ b/media/libaom/sources.mozbuild
@@ -506,6 +506,7 @@ files = {
'../../third_party/aom/aom_dsp/flow_estimation/ransac.c',
'../../third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c',
'../../third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c',
+ '../../third_party/aom/aom_dsp/flow_estimation/x86/disflow_avx2.c',
'../../third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c',
'../../third_party/aom/aom_dsp/fwd_txfm.c',
'../../third_party/aom/aom_dsp/grain_table.c',
@@ -533,11 +534,8 @@ files = {
'../../third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm',
'../../third_party/aom/aom_dsp/x86/aom_quantize_avx.c',
'../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c',
- '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c',
'../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c',
- '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm',
'../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm',
- '../../third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm',
'../../third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm',
'../../third_party/aom/aom_dsp/x86/avg_intrin_avx2.c',
'../../third_party/aom/aom_dsp/x86/avg_intrin_sse2.c',
@@ -599,7 +597,7 @@ files = {
'../../third_party/aom/aom_dsp/x86/sad_sse2.asm',
'../../third_party/aom/aom_dsp/x86/sse_avx2.c',
'../../third_party/aom/aom_dsp/x86/sse_sse4.c',
- '../../third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm',
+ '../../third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm',
'../../third_party/aom/aom_dsp/x86/subtract_avx2.c',
'../../third_party/aom/aom_dsp/x86/subtract_sse2.asm',
'../../third_party/aom/aom_dsp/x86/sum_squares_avx2.c',
@@ -658,7 +656,6 @@ files = {
'../../third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c',
'../../third_party/aom/av1/common/x86/av1_txfm_sse4.c',
'../../third_party/aom/av1/common/x86/cdef_block_avx2.c',
- '../../third_party/aom/av1/common/x86/cdef_block_sse2.c',
'../../third_party/aom/av1/common/x86/cdef_block_sse4.c',
'../../third_party/aom/av1/common/x86/cdef_block_ssse3.c',
'../../third_party/aom/av1/common/x86/cfl_avx2.c',
@@ -859,6 +856,7 @@ files = {
'../../third_party/aom/aom_dsp/flow_estimation/ransac.c',
'../../third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c',
'../../third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c',
+ '../../third_party/aom/aom_dsp/flow_estimation/x86/disflow_avx2.c',
'../../third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c',
'../../third_party/aom/aom_dsp/fwd_txfm.c',
'../../third_party/aom/aom_dsp/grain_table.c',
@@ -886,11 +884,8 @@ files = {
'../../third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm',
'../../third_party/aom/aom_dsp/x86/aom_quantize_avx.c',
'../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c',
- '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c',
'../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c',
- '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm',
'../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm',
- '../../third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm',
'../../third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm',
'../../third_party/aom/aom_dsp/x86/avg_intrin_avx2.c',
'../../third_party/aom/aom_dsp/x86/avg_intrin_sse2.c',
@@ -955,7 +950,7 @@ files = {
'../../third_party/aom/aom_dsp/x86/sse_avx2.c',
'../../third_party/aom/aom_dsp/x86/sse_sse4.c',
'../../third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm',
- '../../third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm',
+ '../../third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm',
'../../third_party/aom/aom_dsp/x86/subtract_avx2.c',
'../../third_party/aom/aom_dsp/x86/subtract_sse2.asm',
'../../third_party/aom/aom_dsp/x86/sum_squares_avx2.c',
@@ -1014,9 +1009,7 @@ files = {
'../../third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c',
'../../third_party/aom/av1/common/x86/av1_txfm_sse4.c',
'../../third_party/aom/av1/common/x86/cdef_block_avx2.c',
- '../../third_party/aom/av1/common/x86/cdef_block_sse2.c',
'../../third_party/aom/av1/common/x86/cdef_block_sse4.c',
- '../../third_party/aom/av1/common/x86/cdef_block_ssse3.c',
'../../third_party/aom/av1/common/x86/cfl_avx2.c',
'../../third_party/aom/av1/common/x86/cfl_sse2.c',
'../../third_party/aom/av1/common/x86/cfl_ssse3.c',
diff --git a/media/libcubeb/0004-audiounit-ios-compile-fixes.patch b/media/libcubeb/0004-audiounit-ios-compile-fixes.patch
new file mode 100644
index 0000000000..465ae0f98a
--- /dev/null
+++ b/media/libcubeb/0004-audiounit-ios-compile-fixes.patch
@@ -0,0 +1,1415 @@
+diff --git a/src/cubeb_audiounit.cpp b/src/cubeb_audiounit.cpp
+--- a/src/cubeb_audiounit.cpp
++++ b/src/cubeb_audiounit.cpp
+@@ -36,16 +36,25 @@
+ #include <vector>
+
+ using namespace std;
+
+ #if MAC_OS_X_VERSION_MIN_REQUIRED < 101000
+ typedef UInt32 AudioFormatFlags;
+ #endif
+
++#if TARGET_OS_IPHONE
++typedef UInt32 AudioDeviceID;
++typedef UInt32 AudioObjectID;
++const UInt32 kAudioObjectUnknown = 0;
++
++#define AudioGetCurrentHostTime mach_absolute_time
++
++#endif
++
+ #define AU_OUT_BUS 0
+ #define AU_IN_BUS 1
+
+ const char * DISPATCH_QUEUE_LABEL = "org.mozilla.cubeb";
+ const char * PRIVATE_AGGREGATE_DEVICE_NAME = "CubebAggregateDevice";
+
+ #ifdef ALOGV
+ #undef ALOGV
+@@ -60,45 +69,47 @@ const char * PRIVATE_AGGREGATE_DEVICE_NA
+ #undef ALOG
+ #endif
+ #define ALOG(msg, ...) \
+ dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_HIGH, 0), \
+ ^{ \
+ LOG(msg, ##__VA_ARGS__); \
+ })
+
++#if !TARGET_OS_IPHONE
+ /* Testing empirically, some headsets report a minimal latency that is very
+ * low, but this does not work in practice. Lie and say the minimum is 256
+ * frames. */
+ const uint32_t SAFE_MIN_LATENCY_FRAMES = 128;
+ const uint32_t SAFE_MAX_LATENCY_FRAMES = 512;
+
+ const AudioObjectPropertyAddress DEFAULT_INPUT_DEVICE_PROPERTY_ADDRESS = {
+ kAudioHardwarePropertyDefaultInputDevice, kAudioObjectPropertyScopeGlobal,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+
+ const AudioObjectPropertyAddress DEFAULT_OUTPUT_DEVICE_PROPERTY_ADDRESS = {
+ kAudioHardwarePropertyDefaultOutputDevice, kAudioObjectPropertyScopeGlobal,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+
+ const AudioObjectPropertyAddress DEVICE_IS_ALIVE_PROPERTY_ADDRESS = {
+ kAudioDevicePropertyDeviceIsAlive, kAudioObjectPropertyScopeGlobal,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+
+ const AudioObjectPropertyAddress DEVICES_PROPERTY_ADDRESS = {
+ kAudioHardwarePropertyDevices, kAudioObjectPropertyScopeGlobal,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+
+ const AudioObjectPropertyAddress INPUT_DATA_SOURCE_PROPERTY_ADDRESS = {
+ kAudioDevicePropertyDataSource, kAudioDevicePropertyScopeInput,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+
+ const AudioObjectPropertyAddress OUTPUT_DATA_SOURCE_PROPERTY_ADDRESS = {
+ kAudioDevicePropertyDataSource, kAudioDevicePropertyScopeOutput,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
++#endif
+
+ typedef uint32_t device_flags_value;
+
+ enum device_flags {
+ DEV_UNKNOWN = 0x00, /* Unknown */
+ DEV_INPUT = 0x01, /* Record device like mic */
+ DEV_OUTPUT = 0x02, /* Playback device like speakers */
+ DEV_SYSTEM_DEFAULT = 0x04, /* System default device */
+@@ -109,49 +120,51 @@ enum device_flags {
+ void
+ audiounit_stream_stop_internal(cubeb_stream * stm);
+ static int
+ audiounit_stream_start_internal(cubeb_stream * stm);
+ static void
+ audiounit_close_stream(cubeb_stream * stm);
+ static int
+ audiounit_setup_stream(cubeb_stream * stm);
++#if !TARGET_OS_IPHONE
+ static vector<AudioObjectID>
+ audiounit_get_devices_of_type(cubeb_device_type devtype);
+ static UInt32
+ audiounit_get_device_presentation_latency(AudioObjectID devid,
+ AudioObjectPropertyScope scope);
+-
+-#if !TARGET_OS_IPHONE
+ static AudioObjectID
+ audiounit_get_default_device_id(cubeb_device_type type);
+ static int
+ audiounit_uninstall_device_changed_callback(cubeb_stream * stm);
+ static int
+ audiounit_uninstall_system_changed_callback(cubeb_stream * stm);
++#endif
++
+ static void
+ audiounit_reinit_stream_async(cubeb_stream * stm, device_flags_value flags);
+-#endif
+
+ extern cubeb_ops const audiounit_ops;
+
+ struct cubeb {
+ cubeb_ops const * ops = &audiounit_ops;
+ owned_critical_section mutex;
+ int active_streams = 0;
+ uint32_t global_latency_frames = 0;
+ cubeb_device_collection_changed_callback input_collection_changed_callback =
+ nullptr;
+ void * input_collection_changed_user_ptr = nullptr;
+ cubeb_device_collection_changed_callback output_collection_changed_callback =
+ nullptr;
+ void * output_collection_changed_user_ptr = nullptr;
++ #if !TARGET_OS_IPHONE
+ // Store list of devices to detect changes
+ vector<AudioObjectID> input_device_array;
+ vector<AudioObjectID> output_device_array;
++ #endif
+ // The queue should be released when it’s no longer needed.
+ dispatch_queue_t serial_queue =
+ dispatch_queue_create(DISPATCH_QUEUE_LABEL, DISPATCH_QUEUE_SERIAL);
+ // Current used channel layout
+ atomic<cubeb_channel_layout> layout{CUBEB_LAYOUT_UNDEFINED};
+ uint32_t channels = 0;
+ };
+
+@@ -181,29 +194,31 @@ to_string(io_side side)
+ }
+ }
+
+ struct device_info {
+ AudioDeviceID id = kAudioObjectUnknown;
+ device_flags_value flags = DEV_UNKNOWN;
+ };
+
++#if !TARGET_OS_IPHONE
+ struct property_listener {
+ AudioDeviceID device_id;
+ const AudioObjectPropertyAddress * property_address;
+ AudioObjectPropertyListenerProc callback;
+ cubeb_stream * stream;
+
+ property_listener(AudioDeviceID id,
+ const AudioObjectPropertyAddress * address,
+ AudioObjectPropertyListenerProc proc, cubeb_stream * stm)
+ : device_id(id), property_address(address), callback(proc), stream(stm)
+ {
+ }
+ };
++#endif
+
+ struct cubeb_stream {
+ explicit cubeb_stream(cubeb * context);
+
+ /* Note: Must match cubeb_stream layout in cubeb.c. */
+ cubeb * context;
+ void * user_ptr = nullptr;
+ /**/
+@@ -252,32 +267,36 @@ struct cubeb_stream {
+ /* Latency requested by the user. */
+ uint32_t latency_frames = 0;
+ atomic<uint32_t> current_latency_frames{0};
+ atomic<uint32_t> total_output_latency_frames{0};
+ unique_ptr<cubeb_resampler, decltype(&cubeb_resampler_destroy)> resampler;
+ /* This is true if a device change callback is currently running. */
+ atomic<bool> switching_device{false};
+ atomic<bool> buffer_size_change_state{false};
++ #if !TARGET_OS_IPHONE
+ AudioDeviceID aggregate_device_id =
+ kAudioObjectUnknown; // the aggregate device id
+ AudioObjectID plugin_id =
+ kAudioObjectUnknown; // used to create aggregate device
++ #endif
+ /* Mixer interface */
+ unique_ptr<cubeb_mixer, decltype(&cubeb_mixer_destroy)> mixer;
+ /* Buffer where remixing/resampling will occur when upmixing is required */
+ /* Only accessed from callback thread */
+ unique_ptr<uint8_t[]> temp_buffer;
+ size_t temp_buffer_size = 0; // size in bytes.
++ #if !TARGET_OS_IPHONE
+ /* Listeners indicating what system events are monitored. */
+ unique_ptr<property_listener> default_input_listener;
+ unique_ptr<property_listener> default_output_listener;
+ unique_ptr<property_listener> input_alive_listener;
+ unique_ptr<property_listener> input_source_listener;
+ unique_ptr<property_listener> output_source_listener;
++ #endif
+ };
+
+ bool
+ has_input(cubeb_stream * stm)
+ {
+ return stm->input_stream_params.rate != 0;
+ }
+
+@@ -381,24 +400,16 @@ bool
+ is_common_sample_rate(Float64 sample_rate)
+ {
+ /* Some commonly used sample rates and their multiples and divisors. */
+ return sample_rate == 8000 || sample_rate == 16000 || sample_rate == 22050 ||
+ sample_rate == 32000 || sample_rate == 44100 || sample_rate == 48000 ||
+ sample_rate == 88200 || sample_rate == 96000;
+ }
+
+-#if TARGET_OS_IPHONE
+-typedef UInt32 AudioDeviceID;
+-typedef UInt32 AudioObjectID;
+-
+-#define AudioGetCurrentHostTime mach_absolute_time
+-
+-#endif
+-
+ uint64_t
+ ConvertHostTimeToNanos(uint64_t host_time)
+ {
+ static struct mach_timebase_info timebase_info;
+ static bool initialized = false;
+ if (!initialized) {
+ mach_timebase_info(&timebase_info);
+ initialized = true;
+@@ -756,23 +767,23 @@ audiounit_init(cubeb ** context, char co
+ }
+
+ static char const *
+ audiounit_get_backend_id(cubeb * /* ctx */)
+ {
+ return "audiounit";
+ }
+
+-#if !TARGET_OS_IPHONE
+
+ static int
+ audiounit_stream_get_volume(cubeb_stream * stm, float * volume);
+ static int
+ audiounit_stream_set_volume(cubeb_stream * stm, float volume);
+
++#if !TARGET_OS_IPHONE
+ static int
+ audiounit_set_device_info(cubeb_stream * stm, AudioDeviceID id, io_side side)
+ {
+ assert(stm);
+
+ device_info * info = nullptr;
+ cubeb_device_type type = CUBEB_DEVICE_TYPE_UNKNOWN;
+
+@@ -806,42 +817,47 @@ audiounit_set_device_info(cubeb_stream *
+ }
+
+ assert(info->id);
+ assert(info->flags & DEV_INPUT && !(info->flags & DEV_OUTPUT) ||
+ !(info->flags & DEV_INPUT) && info->flags & DEV_OUTPUT);
+
+ return CUBEB_OK;
+ }
++#endif
+
+ static int
+ audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags)
+ {
+ auto_lock context_lock(stm->context->mutex);
+ assert((flags & DEV_INPUT && stm->input_unit) ||
+ (flags & DEV_OUTPUT && stm->output_unit));
+ if (!stm->shutdown) {
+ audiounit_stream_stop_internal(stm);
+ }
+
+- int r = audiounit_uninstall_device_changed_callback(stm);
++ int r;
++#if !TARGET_OS_IPHONE
++ r = audiounit_uninstall_device_changed_callback(stm);
+ if (r != CUBEB_OK) {
+ LOG("(%p) Could not uninstall all device change listeners.", stm);
+ }
++#endif
+
+ {
+ auto_lock lock(stm->mutex);
+ float volume = 0.0;
+ int vol_rv = CUBEB_ERROR;
+ if (stm->output_unit) {
+ vol_rv = audiounit_stream_get_volume(stm, &volume);
+ }
+
+ audiounit_close_stream(stm);
+
++ #if !TARGET_OS_IPHONE
+ /* Reinit occurs in one of the following case:
+ * - When the device is not alive any more
+ * - When the default system device change.
+ * - The bluetooth device changed from A2DP to/from HFP/HSP profile
+ * We first attempt to re-use the same device id, should that fail we will
+ * default to the (potentially new) default device. */
+ AudioDeviceID input_device =
+ flags & DEV_INPUT ? stm->input_device.id : kAudioObjectUnknown;
+@@ -861,29 +877,33 @@ audiounit_reinit_stream(cubeb_stream * s
+ r = audiounit_set_device_info(stm, kAudioObjectUnknown, io_side::OUTPUT);
+ if (r != CUBEB_OK) {
+ LOG("(%p) Set output device info failed. This can happen when last media "
+ "device is unplugged",
+ stm);
+ return CUBEB_ERROR;
+ }
+
++ #endif
++
+ if (audiounit_setup_stream(stm) != CUBEB_OK) {
+ LOG("(%p) Stream reinit failed.", stm);
++ #if !TARGET_OS_IPHONE
+ if (flags & DEV_INPUT && input_device != kAudioObjectUnknown) {
+ // Attempt to re-use the same device-id failed, so attempt again with
+ // default input device.
+ audiounit_close_stream(stm);
+ if (audiounit_set_device_info(stm, kAudioObjectUnknown,
+ io_side::INPUT) != CUBEB_OK ||
+ audiounit_setup_stream(stm) != CUBEB_OK) {
+ LOG("(%p) Second stream reinit failed.", stm);
+ return CUBEB_ERROR;
+ }
+ }
++ #endif
+ }
+
+ if (vol_rv == CUBEB_OK) {
+ audiounit_stream_set_volume(stm, volume);
+ }
+
+ // If the stream was running, start it again.
+ if (!stm->shutdown) {
+@@ -909,27 +929,30 @@ audiounit_reinit_stream_async(cubeb_stre
+ // Get/SetProperties method from inside notify callback
+ dispatch_async(stm->context->serial_queue, ^() {
+ if (stm->destroy_pending) {
+ ALOG("(%p) stream pending destroy, cancelling reinit task", stm);
+ return;
+ }
+
+ if (audiounit_reinit_stream(stm, flags) != CUBEB_OK) {
++ #if !TARGET_OS_IPHONE
+ if (audiounit_uninstall_system_changed_callback(stm) != CUBEB_OK) {
+ LOG("(%p) Could not uninstall system changed callback", stm);
+ }
++ #endif
+ stm->state_callback(stm, stm->user_ptr, CUBEB_STATE_ERROR);
+ LOG("(%p) Could not reopen the stream after switching.", stm);
+ }
+ stm->switching_device = false;
+ stm->reinit_pending = false;
+ });
+ }
+
++#if !TARGET_OS_IPHONE
+ static char const *
+ event_addr_to_string(AudioObjectPropertySelector selector)
+ {
+ switch (selector) {
+ case kAudioHardwarePropertyDefaultOutputDevice:
+ return "kAudioHardwarePropertyDefaultOutputDevice";
+ case kAudioHardwarePropertyDefaultInputDevice:
+ return "kAudioHardwarePropertyDefaultInputDevice";
+@@ -1091,16 +1114,17 @@ audiounit_install_device_changed_callbac
+ rv, stm->input_device.id);
+ r = CUBEB_ERROR;
+ }
+ }
+
+ return r;
+ }
+
++#if !TARGET_OS_IPHONE
+ static int
+ audiounit_install_system_changed_callback(cubeb_stream * stm)
+ {
+ OSStatus r;
+
+ if (stm->output_unit) {
+ /* This event will notify us when the default audio device changes,
+ * for example when the user plugs in a USB headset and the system chooses
+@@ -1131,16 +1155,17 @@ audiounit_install_system_changed_callbac
+ "kAudioHardwarePropertyDefaultInputDevice rv=%d",
+ r);
+ return CUBEB_ERROR;
+ }
+ }
+
+ return CUBEB_OK;
+ }
++#endif
+
+ static int
+ audiounit_uninstall_device_changed_callback(cubeb_stream * stm)
+ {
+ OSStatus rv;
+ // Failing to uninstall listeners is not a fatal error.
+ int r = CUBEB_OK;
+
+@@ -1207,17 +1232,17 @@ audiounit_uninstall_system_changed_callb
+ static int
+ audiounit_get_acceptable_latency_range(AudioValueRange * latency_range)
+ {
+ UInt32 size;
+ OSStatus r;
+ AudioDeviceID output_device_id;
+ AudioObjectPropertyAddress output_device_buffer_size_range = {
+ kAudioDevicePropertyBufferFrameSizeRange, kAudioDevicePropertyScopeOutput,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+
+ output_device_id = audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT);
+ if (output_device_id == kAudioObjectUnknown) {
+ LOG("Could not get default output device id.");
+ return CUBEB_ERROR;
+ }
+
+ /* Get the buffer size range this device supports */
+@@ -1228,17 +1253,16 @@ audiounit_get_acceptable_latency_range(A
+ &size, latency_range);
+ if (r != noErr) {
+ LOG("AudioObjectGetPropertyData/buffer size range rv=%d", r);
+ return CUBEB_ERROR;
+ }
+
+ return CUBEB_OK;
+ }
+-#endif /* !TARGET_OS_IPHONE */
+
+ static AudioObjectID
+ audiounit_get_default_device_id(cubeb_device_type type)
+ {
+ const AudioObjectPropertyAddress * adr;
+ if (type == CUBEB_DEVICE_TYPE_OUTPUT) {
+ adr = &DEFAULT_OUTPUT_DEVICE_PROPERTY_ADDRESS;
+ } else if (type == CUBEB_DEVICE_TYPE_INPUT) {
+@@ -1251,31 +1275,32 @@ audiounit_get_default_device_id(cubeb_de
+ UInt32 size = sizeof(AudioDeviceID);
+ if (AudioObjectGetPropertyData(kAudioObjectSystemObject, adr, 0, NULL, &size,
+ &devid) != noErr) {
+ return kAudioObjectUnknown;
+ }
+
+ return devid;
+ }
++#endif /* !TARGET_OS_IPHONE */
+
+ int
+ audiounit_get_max_channel_count(cubeb * ctx, uint32_t * max_channels)
+ {
+ #if TARGET_OS_IPHONE
+ // TODO: [[AVAudioSession sharedInstance] maximumOutputNumberOfChannels]
+ *max_channels = 2;
+ #else
+ UInt32 size;
+ OSStatus r;
+ AudioDeviceID output_device_id;
+ AudioStreamBasicDescription stream_format;
+ AudioObjectPropertyAddress stream_format_address = {
+ kAudioDevicePropertyStreamFormat, kAudioDevicePropertyScopeOutput,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+
+ assert(ctx && max_channels);
+
+ output_device_id = audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT);
+ if (output_device_id == kAudioObjectUnknown) {
+ return CUBEB_ERROR;
+ }
+
+@@ -1304,52 +1329,52 @@ audiounit_get_min_latency(cubeb * /* ctx
+ AudioValueRange latency_range;
+ if (audiounit_get_acceptable_latency_range(&latency_range) != CUBEB_OK) {
+ LOG("Could not get acceptable latency range.");
+ return CUBEB_ERROR;
+ }
+
+ *latency_frames =
+ max<uint32_t>(latency_range.mMinimum, SAFE_MIN_LATENCY_FRAMES);
++ return CUBEB_OK;
+ #endif
+-
+- return CUBEB_OK;
+ }
+
+ static int
+ audiounit_get_preferred_sample_rate(cubeb * /* ctx */, uint32_t * rate)
+ {
+ #if TARGET_OS_IPHONE
+- // TODO
+- return CUBEB_ERROR_NOT_SUPPORTED;
++ *rate = 44100;
++ return CUBEB_OK;
+ #else
+ UInt32 size;
+ OSStatus r;
+ Float64 fsamplerate;
+ AudioDeviceID output_device_id;
+ AudioObjectPropertyAddress samplerate_address = {
+ kAudioDevicePropertyNominalSampleRate, kAudioObjectPropertyScopeGlobal,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+
+ output_device_id = audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT);
+ if (output_device_id == kAudioObjectUnknown) {
+ return CUBEB_ERROR;
+ }
+
+ size = sizeof(fsamplerate);
+ r = AudioObjectGetPropertyData(output_device_id, &samplerate_address, 0, NULL,
+ &size, &fsamplerate);
+
+ if (r != noErr) {
+ return CUBEB_ERROR;
+ }
+
+ *rate = static_cast<uint32_t>(fsamplerate);
++
++ return CUBEB_OK;
+ #endif
+- return CUBEB_OK;
+ }
+
+ static cubeb_channel_layout
+ audiounit_convert_channel_layout(AudioChannelLayout * layout)
+ {
+ // When having one or two channel, force mono or stereo. Some devices (namely,
+ // Bose QC35, mark 1 and 2), expose a single channel mapped to the right for
+ // some reason.
+@@ -1380,16 +1405,19 @@ audiounit_convert_channel_layout(AudioCh
+ }
+
+ return cl;
+ }
+
+ static cubeb_channel_layout
+ audiounit_get_preferred_channel_layout(AudioUnit output_unit)
+ {
++ #if TARGET_OS_IPHONE
++ return CUBEB_LAYOUT_STEREO;
++ #else
+ OSStatus rv = noErr;
+ UInt32 size = 0;
+ rv = AudioUnitGetPropertyInfo(
+ output_unit, kAudioDevicePropertyPreferredChannelLayout,
+ kAudioUnitScope_Output, AU_OUT_BUS, &size, nullptr);
+ if (rv != noErr) {
+ LOG("AudioUnitGetPropertyInfo/kAudioDevicePropertyPreferredChannelLayout "
+ "rv=%d",
+@@ -1404,16 +1432,17 @@ audiounit_get_preferred_channel_layout(A
+ kAudioUnitScope_Output, AU_OUT_BUS, layout.get(), &size);
+ if (rv != noErr) {
+ LOG("AudioUnitGetProperty/kAudioDevicePropertyPreferredChannelLayout rv=%d",
+ rv);
+ return CUBEB_LAYOUT_UNDEFINED;
+ }
+
+ return audiounit_convert_channel_layout(layout.get());
++ #endif
+ }
+
+ static cubeb_channel_layout
+ audiounit_get_current_channel_layout(AudioUnit output_unit)
+ {
+ OSStatus rv = noErr;
+ UInt32 size = 0;
+ rv = AudioUnitGetPropertyInfo(
+@@ -1437,18 +1466,20 @@ audiounit_get_current_channel_layout(Aud
+ }
+
+ return audiounit_convert_channel_layout(layout.get());
+ }
+
+ static int
+ audiounit_create_unit(AudioUnit * unit, device_info * device);
+
++#if !TARGET_OS_IPHONE
+ static OSStatus
+ audiounit_remove_device_listener(cubeb * context, cubeb_device_type devtype);
++#endif
+
+ static void
+ audiounit_destroy(cubeb * ctx)
+ {
+ {
+ auto_lock lock(ctx->mutex);
+
+ // Disabling this assert for bug 1083664 -- we seem to leak a stream
+@@ -1460,23 +1491,25 @@ audiounit_destroy(cubeb * ctx)
+
+ // Destroying a cubeb context with device collection callbacks registered
+ // is misuse of the API, assert then attempt to clean up.
+ assert(!ctx->input_collection_changed_callback &&
+ !ctx->input_collection_changed_user_ptr &&
+ !ctx->output_collection_changed_callback &&
+ !ctx->output_collection_changed_user_ptr);
+
++ #if !TARGET_OS_IPHONE
+ /* Unregister the callback if necessary. */
+ if (ctx->input_collection_changed_callback) {
+ audiounit_remove_device_listener(ctx, CUBEB_DEVICE_TYPE_INPUT);
+ }
+ if (ctx->output_collection_changed_callback) {
+ audiounit_remove_device_listener(ctx, CUBEB_DEVICE_TYPE_OUTPUT);
+ }
++ #endif
+ }
+
+ dispatch_release(ctx->serial_queue);
+
+ delete ctx;
+ }
+
+ static void
+@@ -1594,23 +1627,24 @@ audiounit_layout_init(cubeb_stream * stm
+ }
+
+ stm->context->layout = audiounit_get_current_channel_layout(stm->output_unit);
+
+ audiounit_set_channel_layout(stm->output_unit, io_side::OUTPUT,
+ stm->context->layout);
+ }
+
++#if !TARGET_OS_IPHONE
+ static vector<AudioObjectID>
+ audiounit_get_sub_devices(AudioDeviceID device_id)
+ {
+ vector<AudioDeviceID> sub_devices;
+ AudioObjectPropertyAddress property_address = {
+ kAudioAggregateDevicePropertyActiveSubDeviceList,
+- kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain};
+ UInt32 size = 0;
+ OSStatus rv = AudioObjectGetPropertyDataSize(device_id, &property_address, 0,
+ nullptr, &size);
+
+ if (rv != noErr) {
+ sub_devices.push_back(device_id);
+ return sub_devices;
+ }
+@@ -1629,17 +1663,17 @@ audiounit_get_sub_devices(AudioDeviceID
+ }
+
+ static int
+ audiounit_create_blank_aggregate_device(AudioObjectID * plugin_id,
+ AudioDeviceID * aggregate_device_id)
+ {
+ AudioObjectPropertyAddress address_plugin_bundle_id = {
+ kAudioHardwarePropertyPlugInForBundleID, kAudioObjectPropertyScopeGlobal,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+ UInt32 size = 0;
+ OSStatus r = AudioObjectGetPropertyDataSize(
+ kAudioObjectSystemObject, &address_plugin_bundle_id, 0, NULL, &size);
+ if (r != noErr) {
+ LOG("AudioObjectGetPropertyDataSize/"
+ "kAudioHardwarePropertyPlugInForBundleID, rv=%d",
+ r);
+ return CUBEB_ERROR;
+@@ -1659,17 +1693,17 @@ audiounit_create_blank_aggregate_device(
+ LOG("AudioObjectGetPropertyData/kAudioHardwarePropertyPlugInForBundleID, "
+ "rv=%d",
+ r);
+ return CUBEB_ERROR;
+ }
+
+ AudioObjectPropertyAddress create_aggregate_device_address = {
+ kAudioPlugInCreateAggregateDevice, kAudioObjectPropertyScopeGlobal,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+ r = AudioObjectGetPropertyDataSize(
+ *plugin_id, &create_aggregate_device_address, 0, nullptr, &size);
+ if (r != noErr) {
+ LOG("AudioObjectGetPropertyDataSize/kAudioPlugInCreateAggregateDevice, "
+ "rv=%d",
+ r);
+ return CUBEB_ERROR;
+ }
+@@ -1731,17 +1765,17 @@ audiounit_create_blank_aggregate_device(
+ // object is increased.
+ static CFStringRef
+ get_device_name(AudioDeviceID id)
+ {
+ UInt32 size = sizeof(CFStringRef);
+ CFStringRef UIname = nullptr;
+ AudioObjectPropertyAddress address_uuid = {kAudioDevicePropertyDeviceUID,
+ kAudioObjectPropertyScopeGlobal,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+ OSStatus err =
+ AudioObjectGetPropertyData(id, &address_uuid, 0, nullptr, &size, &UIname);
+ return (err == noErr) ? UIname : NULL;
+ }
+
+ static int
+ audiounit_set_aggregate_sub_device_list(AudioDeviceID aggregate_device_id,
+ AudioDeviceID input_device_id,
+@@ -1774,17 +1808,17 @@ audiounit_set_aggregate_sub_device_list(
+ return CUBEB_ERROR;
+ }
+ CFArrayAppendValue(aggregate_sub_devices_array, ref);
+ CFRelease(ref);
+ }
+
+ AudioObjectPropertyAddress aggregate_sub_device_list = {
+ kAudioAggregateDevicePropertyFullSubDeviceList,
+- kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain};
+ UInt32 size = sizeof(CFMutableArrayRef);
+ OSStatus rv = AudioObjectSetPropertyData(
+ aggregate_device_id, &aggregate_sub_device_list, 0, nullptr, size,
+ &aggregate_sub_devices_array);
+ CFRelease(aggregate_sub_devices_array);
+ if (rv != noErr) {
+ LOG("AudioObjectSetPropertyData/"
+ "kAudioAggregateDevicePropertyFullSubDeviceList, rv=%d",
+@@ -1796,17 +1830,17 @@ audiounit_set_aggregate_sub_device_list(
+ }
+
+ static int
+ audiounit_set_master_aggregate_device(const AudioDeviceID aggregate_device_id)
+ {
+ assert(aggregate_device_id != kAudioObjectUnknown);
+ AudioObjectPropertyAddress master_aggregate_sub_device = {
+ kAudioAggregateDevicePropertyMasterSubDevice,
+- kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain};
+
+ // Master become the 1st output sub device
+ AudioDeviceID output_device_id =
+ audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT);
+ const vector<AudioDeviceID> output_sub_devices =
+ audiounit_get_sub_devices(output_device_id);
+ CFStringRef master_sub_device = get_device_name(output_sub_devices[0]);
+
+@@ -1829,17 +1863,17 @@ audiounit_set_master_aggregate_device(co
+
+ static int
+ audiounit_activate_clock_drift_compensation(
+ const AudioDeviceID aggregate_device_id)
+ {
+ assert(aggregate_device_id != kAudioObjectUnknown);
+ AudioObjectPropertyAddress address_owned = {
+ kAudioObjectPropertyOwnedObjects, kAudioObjectPropertyScopeGlobal,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+
+ UInt32 qualifier_data_size = sizeof(AudioObjectID);
+ AudioClassID class_id = kAudioSubDeviceClassID;
+ void * qualifier_data = &class_id;
+ UInt32 size = 0;
+ OSStatus rv = AudioObjectGetPropertyDataSize(
+ aggregate_device_id, &address_owned, qualifier_data_size, qualifier_data,
+ &size);
+@@ -1861,17 +1895,17 @@ audiounit_activate_clock_drift_compensat
+ if (rv != noErr) {
+ LOG("AudioObjectGetPropertyData/kAudioObjectPropertyOwnedObjects, rv=%d",
+ rv);
+ return CUBEB_ERROR;
+ }
+
+ AudioObjectPropertyAddress address_drift = {
+ kAudioSubDevicePropertyDriftCompensation, kAudioObjectPropertyScopeGlobal,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+
+ // Start from the second device since the first is the master clock
+ for (UInt32 i = 1; i < subdevices_num; ++i) {
+ UInt32 drift_compensation_value = 1;
+ rv = AudioObjectSetPropertyData(sub_devices[i], &address_drift, 0, nullptr,
+ sizeof(UInt32), &drift_compensation_value);
+ if (rv != noErr) {
+ LOG("AudioObjectSetPropertyData/"
+@@ -1930,17 +1964,17 @@ audiounit_workaround_for_airpod(cubeb_st
+ &output_min_rate, &output_max_rate, &output_nominal_rate);
+ LOG("(%p) Output device %u, name: %s, min: %u, max: %u, nominal rate: %u",
+ stm, stm->output_device.id, output_device_info.friendly_name,
+ output_min_rate, output_max_rate, output_nominal_rate);
+
+ Float64 rate = input_nominal_rate;
+ AudioObjectPropertyAddress addr = {kAudioDevicePropertyNominalSampleRate,
+ kAudioObjectPropertyScopeGlobal,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+
+ OSStatus rv = AudioObjectSetPropertyData(stm->aggregate_device_id, &addr, 0,
+ nullptr, sizeof(Float64), &rate);
+ if (rv != noErr) {
+ LOG("Non fatal error, "
+ "AudioObjectSetPropertyData/kAudioDevicePropertyNominalSampleRate, "
+ "rv=%d",
+ rv);
+@@ -2014,17 +2048,17 @@ audiounit_create_aggregate_device(cubeb_
+ static int
+ audiounit_destroy_aggregate_device(AudioObjectID plugin_id,
+ AudioDeviceID * aggregate_device_id)
+ {
+ assert(aggregate_device_id && *aggregate_device_id != kAudioDeviceUnknown &&
+ plugin_id != kAudioObjectUnknown);
+ AudioObjectPropertyAddress destroy_aggregate_device_addr = {
+ kAudioPlugInDestroyAggregateDevice, kAudioObjectPropertyScopeGlobal,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+ UInt32 size;
+ OSStatus rv = AudioObjectGetPropertyDataSize(
+ plugin_id, &destroy_aggregate_device_addr, 0, NULL, &size);
+ if (rv != noErr) {
+ LOG("AudioObjectGetPropertyDataSize/kAudioPlugInDestroyAggregateDevice, "
+ "rv=%d",
+ rv);
+ return CUBEB_ERROR;
+@@ -2037,16 +2071,17 @@ audiounit_destroy_aggregate_device(Audio
+ rv);
+ return CUBEB_ERROR;
+ }
+
+ LOG("Destroyed aggregate device %d", *aggregate_device_id);
+ *aggregate_device_id = kAudioObjectUnknown;
+ return CUBEB_OK;
+ }
++#endif
+
+ static int
+ audiounit_new_unit_instance(AudioUnit * unit, device_info * device)
+ {
+ AudioComponentDescription desc;
+ AudioComponent comp;
+ OSStatus rv;
+
+@@ -2173,16 +2208,19 @@ audiounit_init_input_linear_buffer(cubeb
+ assert(stream->input_linear_buffer->length() == 0);
+
+ return CUBEB_OK;
+ }
+
+ static uint32_t
+ audiounit_clamp_latency(cubeb_stream * stm, uint32_t latency_frames)
+ {
++ #if TARGET_OS_IPHONE
++ return latency_frames;
++ #else
+ // For the 1st stream set anything within safe min-max
+ assert(audiounit_active_streams(stm->context) > 0);
+ if (audiounit_active_streams(stm->context) == 1) {
+ return max(min<uint32_t>(latency_frames, SAFE_MAX_LATENCY_FRAMES),
+ SAFE_MIN_LATENCY_FRAMES);
+ }
+ assert(stm->output_unit);
+
+@@ -2233,18 +2271,20 @@ audiounit_clamp_latency(cubeb_stream * s
+ } else if (output_buffer_size != 0) {
+ upper_latency_limit = output_buffer_size;
+ } else {
+ upper_latency_limit = SAFE_MAX_LATENCY_FRAMES;
+ }
+
+ return max(min<uint32_t>(latency_frames, upper_latency_limit),
+ SAFE_MIN_LATENCY_FRAMES);
++ #endif
+ }
+
++#if !TARGET_OS_IPHONE
+ /*
+ * Change buffer size is prone to deadlock thus we change it
+ * following the steps:
+ * - register a listener for the buffer size property
+ * - change the property
+ * - wait until the listener is executed
+ * - property has changed, remove the listener
+ * */
+@@ -2285,21 +2325,25 @@ buffer_size_changed_callback(void * inCl
+ "= %d for scope %d",
+ stm, au_type, new_buffer_size, inScope);
+ }
+ stm->buffer_size_change_state = true;
+ break;
+ }
+ }
+ }
++#endif
+
+ static int
+ audiounit_set_buffer_size(cubeb_stream * stm, uint32_t new_size_frames,
+ io_side side)
+ {
++ #if TARGET_OS_IPHONE
++ return CUBEB_OK;
++ #else
+ AudioUnit au = stm->output_unit;
+ AudioUnitScope au_scope = kAudioUnitScope_Input;
+ AudioUnitElement au_element = AU_OUT_BUS;
+
+ if (side == io_side::INPUT) {
+ au = stm->input_unit;
+ au_scope = kAudioUnitScope_Output;
+ au_element = AU_IN_BUS;
+@@ -2377,16 +2421,17 @@ audiounit_set_buffer_size(cubeb_stream *
+ if (!stm->buffer_size_change_state && count >= 30) {
+ LOG("(%p) Error, did not get buffer size change callback ...", stm);
+ return CUBEB_ERROR;
+ }
+
+ LOG("(%p) %s buffer size changed to %u frames.", stm, to_string(side),
+ new_size_frames);
+ return CUBEB_OK;
++ #endif
+ }
+
+ static int
+ audiounit_configure_input(cubeb_stream * stm)
+ {
+ assert(stm && stm->input_unit);
+
+ int r = 0;
+@@ -2593,16 +2638,17 @@ audiounit_setup_stream(cubeb_stream * st
+ return CUBEB_ERROR_NOT_SUPPORTED;
+ }
+
+ int r = 0;
+
+ device_info in_dev_info = stm->input_device;
+ device_info out_dev_info = stm->output_device;
+
++ #if !TARGET_OS_IPHONE
+ if (has_input(stm) && has_output(stm) &&
+ stm->input_device.id != stm->output_device.id) {
+ r = audiounit_create_aggregate_device(stm);
+ if (r != CUBEB_OK) {
+ stm->aggregate_device_id = kAudioObjectUnknown;
+ LOG("(%p) Create aggregate devices failed.", stm);
+ // !!!NOTE: It is not necessary to return here. If it does not
+ // return it will fallback to the old implementation. The intention
+@@ -2610,16 +2656,20 @@ audiounit_setup_stream(cubeb_stream * st
+ // it after a couple of weeks.
+ return r;
+ } else {
+ in_dev_info.id = out_dev_info.id = stm->aggregate_device_id;
+ in_dev_info.flags = DEV_INPUT;
+ out_dev_info.flags = DEV_OUTPUT;
+ }
+ }
++ #else
++ in_dev_info.flags = DEV_SYSTEM_DEFAULT | DEV_INPUT;
++ out_dev_info.flags = DEV_SYSTEM_DEFAULT | DEV_OUTPUT;
++ #endif
+
+ if (has_input(stm)) {
+ r = audiounit_create_unit(&stm->input_unit, &in_dev_info);
+ if (r != CUBEB_OK) {
+ LOG("(%p) AudioUnit creation for input failed.", stm);
+ return r;
+ }
+ }
+@@ -2751,18 +2801,20 @@ audiounit_setup_stream(cubeb_stream * st
+
+ if (stm->output_unit != NULL) {
+ r = AudioUnitInitialize(stm->output_unit);
+ if (r != noErr) {
+ LOG("AudioUnitInitialize/output rv=%d", r);
+ return CUBEB_ERROR;
+ }
+
++ #if !TARGET_OS_IPHONE
+ stm->current_latency_frames = audiounit_get_device_presentation_latency(
+ stm->output_device.id, kAudioDevicePropertyScopeOutput);
++ #endif
+
+ Float64 unit_s;
+ UInt32 size = sizeof(unit_s);
+ if (AudioUnitGetProperty(stm->output_unit, kAudioUnitProperty_Latency,
+ kAudioUnitScope_Global, 0, &unit_s,
+ &size) == noErr) {
+ stm->current_latency_frames +=
+ static_cast<uint32_t>(unit_s * stm->output_desc.mSampleRate);
+@@ -2772,20 +2824,22 @@ audiounit_setup_stream(cubeb_stream * st
+ if (stm->input_unit && stm->output_unit) {
+ // According to the I/O hardware rate it is expected a specific pattern of
+ // callbacks for example is input is 44100 and output is 48000 we expected
+ // no more than 2 out callback in a row.
+ stm->expected_output_callbacks_in_a_row =
+ ceilf(stm->output_hw_rate / stm->input_hw_rate);
+ }
+
++ #if !TARGET_OS_IPHONE
+ r = audiounit_install_device_changed_callback(stm);
+ if (r != CUBEB_OK) {
+ LOG("(%p) Could not install all device change callback.", stm);
+ }
++ #endif
+
+ return CUBEB_OK;
+ }
+
+ cubeb_stream::cubeb_stream(cubeb * context)
+ : context(context), resampler(nullptr, cubeb_resampler_destroy),
+ mixer(nullptr, cubeb_mixer_destroy)
+ {
+@@ -2823,51 +2877,57 @@ audiounit_stream_init(cubeb * context, c
+ stm->latency_frames = latency_frames;
+
+ if ((input_device && !input_stream_params) ||
+ (output_device && !output_stream_params)) {
+ return CUBEB_ERROR_INVALID_PARAMETER;
+ }
+ if (input_stream_params) {
+ stm->input_stream_params = *input_stream_params;
++ #if !TARGET_OS_IPHONE
+ r = audiounit_set_device_info(
+ stm.get(), reinterpret_cast<uintptr_t>(input_device), io_side::INPUT);
+ if (r != CUBEB_OK) {
+ LOG("(%p) Fail to set device info for input.", stm.get());
+ return r;
+ }
++ #endif
+ }
+ if (output_stream_params) {
+ stm->output_stream_params = *output_stream_params;
++ #if !TARGET_OS_IPHONE
+ r = audiounit_set_device_info(
+ stm.get(), reinterpret_cast<uintptr_t>(output_device), io_side::OUTPUT);
+ if (r != CUBEB_OK) {
+ LOG("(%p) Fail to set device info for output.", stm.get());
+ return r;
+ }
++ #endif
+ }
+
+ {
+ // It's not critical to lock here, because no other thread has been started
+ // yet, but it allows to assert that the lock has been taken in
+ // `audiounit_setup_stream`.
+ auto_lock lock(stm->mutex);
+ r = audiounit_setup_stream(stm.get());
+ }
+
+ if (r != CUBEB_OK) {
+ LOG("(%p) Could not setup the audiounit stream.", stm.get());
+ return r;
+ }
+
++ #if !TARGET_OS_IPHONE
+ r = audiounit_install_system_changed_callback(stm.get());
+ if (r != CUBEB_OK) {
+ LOG("(%p) Could not install the device change callback.", stm.get());
+ return r;
+ }
++ #endif
+
+ *stream = stm.release();
+ LOG("(%p) Cubeb stream init successful.", *stream);
+ return CUBEB_OK;
+ }
+
+ static void
+ audiounit_close_stream(cubeb_stream * stm)
+@@ -2886,54 +2946,60 @@ audiounit_close_stream(cubeb_stream * st
+ AudioUnitUninitialize(stm->output_unit);
+ AudioComponentInstanceDispose(stm->output_unit);
+ stm->output_unit = nullptr;
+ }
+
+ stm->resampler.reset();
+ stm->mixer.reset();
+
++ #if !TARGET_OS_IPHONE
+ if (stm->aggregate_device_id != kAudioObjectUnknown) {
+ audiounit_destroy_aggregate_device(stm->plugin_id,
+ &stm->aggregate_device_id);
+ stm->aggregate_device_id = kAudioObjectUnknown;
+ }
++ #endif
+ }
+
+ static void
+ audiounit_stream_destroy_internal(cubeb_stream * stm)
+ {
+ stm->context->mutex.assert_current_thread_owns();
+
++#if !TARGET_OS_IPHONE
+ int r = audiounit_uninstall_system_changed_callback(stm);
+ if (r != CUBEB_OK) {
+ LOG("(%p) Could not uninstall the device changed callback", stm);
+ }
+ r = audiounit_uninstall_device_changed_callback(stm);
+ if (r != CUBEB_OK) {
+ LOG("(%p) Could not uninstall all device change listeners", stm);
+ }
++#endif
+
+ auto_lock lock(stm->mutex);
+ audiounit_close_stream(stm);
+ assert(audiounit_active_streams(stm->context) >= 1);
+ audiounit_decrement_active_streams(stm->context);
+ }
+
+ static void
+ audiounit_stream_destroy(cubeb_stream * stm)
+ {
++ #if !TARGET_OS_IPHONE
+ int r = audiounit_uninstall_system_changed_callback(stm);
+ if (r != CUBEB_OK) {
+ LOG("(%p) Could not uninstall the device changed callback", stm);
+ }
+ r = audiounit_uninstall_device_changed_callback(stm);
+ if (r != CUBEB_OK) {
+ LOG("(%p) Could not uninstall all device change listeners", stm);
+ }
++ #endif
+
+ if (!stm->shutdown.load()) {
+ auto_lock context_lock(stm->context->mutex);
+ audiounit_stream_stop_internal(stm);
+ stm->shutdown = true;
+ }
+
+ stm->destroy_pending = true;
+@@ -3081,16 +3147,17 @@ convert_uint32_into_string(UInt32 data)
+ // Reverse 0xWXYZ into 0xZYXW.
+ str[0] = (char)(data >> 24);
+ str[1] = (char)(data >> 16);
+ str[2] = (char)(data >> 8);
+ str[3] = (char)(data);
+ return str;
+ }
+
++#if !TARGET_OS_IPHONE
+ int
+ audiounit_get_default_device_datasource(cubeb_device_type type, UInt32 * data)
+ {
+ AudioDeviceID id = audiounit_get_default_device_id(type);
+ if (id == kAudioObjectUnknown) {
+ return CUBEB_ERROR;
+ }
+
+@@ -3102,38 +3169,43 @@ audiounit_get_default_device_datasource(
+ : &OUTPUT_DATA_SOURCE_PROPERTY_ADDRESS,
+ 0, NULL, &size, data);
+ if (r != noErr) {
+ *data = 0;
+ }
+
+ return CUBEB_OK;
+ }
++#endif
+
+ int
+ audiounit_get_default_device_name(cubeb_stream * stm,
+ cubeb_device * const device,
+ cubeb_device_type type)
+ {
++#if TARGET_OS_IPHONE
++ return CUBEB_ERROR_NOT_SUPPORTED;
++#else
+ assert(stm);
+ assert(device);
+
+ UInt32 data;
+ int r = audiounit_get_default_device_datasource(type, &data);
+ if (r != CUBEB_OK) {
+ return r;
+ }
+ char ** name = type == CUBEB_DEVICE_TYPE_INPUT ? &device->input_name
+ : &device->output_name;
+ *name = convert_uint32_into_string(data).release();
+ if (!strlen(*name)) { // empty string.
+ LOG("(%p) name of %s device is empty!", stm,
+ type == CUBEB_DEVICE_TYPE_INPUT ? "input" : "output");
+ }
+ return CUBEB_OK;
++ #endif
+ }
+
+ int
+ audiounit_stream_get_current_device(cubeb_stream * stm,
+ cubeb_device ** const device)
+ {
+ #if TARGET_OS_IPHONE
+ // TODO
+@@ -3178,16 +3250,17 @@ audiounit_stream_register_device_changed
+ auto_lock dev_cb_lock(stream->device_changed_callback_lock);
+ /* Note: second register without unregister first causes 'nope' error.
+ * Current implementation requires unregister before register a new cb. */
+ assert(!device_changed_callback || !stream->device_changed_callback);
+ stream->device_changed_callback = device_changed_callback;
+ return CUBEB_OK;
+ }
+
++#if !TARGET_OS_IPHONE
+ static char *
+ audiounit_strref_to_cstr_utf8(CFStringRef strref)
+ {
+ CFIndex len, size;
+ char * ret;
+ if (strref == NULL) {
+ return NULL;
+ }
+@@ -3199,22 +3272,24 @@ audiounit_strref_to_cstr_utf8(CFStringRe
+
+ if (!CFStringGetCString(strref, ret, size, kCFStringEncodingUTF8)) {
+ delete[] ret;
+ ret = NULL;
+ }
+
+ return ret;
+ }
+-
++#endif
++
++#if !TARGET_OS_IPHONE
+ static uint32_t
+ audiounit_get_channel_count(AudioObjectID devid, AudioObjectPropertyScope scope)
+ {
+ AudioObjectPropertyAddress adr = {0, scope,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+ UInt32 size = 0;
+ uint32_t i, ret = 0;
+
+ adr.mSelector = kAudioDevicePropertyStreamConfiguration;
+
+ if (AudioObjectGetPropertyDataSize(devid, &adr, 0, NULL, &size) == noErr &&
+ size > 0) {
+ AudioBufferList * list = static_cast<AudioBufferList *>(alloca(size));
+@@ -3230,17 +3305,17 @@ audiounit_get_channel_count(AudioObjectI
+
+ static void
+ audiounit_get_available_samplerate(AudioObjectID devid,
+ AudioObjectPropertyScope scope,
+ uint32_t * min, uint32_t * max,
+ uint32_t * def)
+ {
+ AudioObjectPropertyAddress adr = {0, scope,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+
+ adr.mSelector = kAudioDevicePropertyNominalSampleRate;
+ if (AudioObjectHasProperty(devid, &adr)) {
+ UInt32 size = sizeof(Float64);
+ Float64 fvalue = 0.0;
+ if (AudioObjectGetPropertyData(devid, &adr, 0, NULL, &size, &fvalue) ==
+ noErr) {
+ *def = fvalue;
+@@ -3272,17 +3347,17 @@ audiounit_get_available_samplerate(Audio
+ }
+ }
+
+ static UInt32
+ audiounit_get_device_presentation_latency(AudioObjectID devid,
+ AudioObjectPropertyScope scope)
+ {
+ AudioObjectPropertyAddress adr = {0, scope,
+- kAudioObjectPropertyElementMaster};
++ kAudioObjectPropertyElementMain};
+ UInt32 size, dev, stream = 0;
+ AudioStreamID sid[1];
+
+ adr.mSelector = kAudioDevicePropertyLatency;
+ size = sizeof(UInt32);
+ if (AudioObjectGetPropertyData(devid, &adr, 0, NULL, &size, &dev) != noErr) {
+ dev = 0;
+ }
+@@ -3297,28 +3372,32 @@ audiounit_get_device_presentation_latenc
+
+ return dev + stream;
+ }
+
+ static int
+ audiounit_create_device_from_hwdev(cubeb_device_info * dev_info,
+ AudioObjectID devid, cubeb_device_type type)
+ {
+- AudioObjectPropertyAddress adr = {0, 0, kAudioObjectPropertyElementMaster};
++ AudioObjectPropertyAddress adr = {0, 0, kAudioObjectPropertyElementMain};
+ UInt32 size;
+
+ if (type == CUBEB_DEVICE_TYPE_OUTPUT) {
+ adr.mScope = kAudioDevicePropertyScopeOutput;
+ } else if (type == CUBEB_DEVICE_TYPE_INPUT) {
+ adr.mScope = kAudioDevicePropertyScopeInput;
+ } else {
+ return CUBEB_ERROR;
+ }
+
++ #if TARGET_OS_IPHONE
++  UInt32 ch = 2;
++ #else
+ UInt32 ch = audiounit_get_channel_count(devid, adr.mScope);
++ #endif
+ if (ch == 0) {
+ return CUBEB_ERROR;
+ }
+
+ PodZero(dev_info, 1);
+
+ CFStringRef device_id_str = nullptr;
+ size = sizeof(CFStringRef);
+@@ -3412,17 +3491,26 @@ audiounit_create_device_from_hwdev(cubeb
+
+ bool
+ is_aggregate_device(cubeb_device_info * device_info)
+ {
+ assert(device_info->friendly_name);
+ return !strncmp(device_info->friendly_name, PRIVATE_AGGREGATE_DEVICE_NAME,
+ strlen(PRIVATE_AGGREGATE_DEVICE_NAME));
+ }
+-
++#endif
++
++#if TARGET_OS_IPHONE
++static int
++audiounit_enumerate_devices(cubeb * /* context */, cubeb_device_type type,
++ cubeb_device_collection * collection)
++{
++ return CUBEB_ERROR_NOT_SUPPORTED;
++}
++#else
+ static int
+ audiounit_enumerate_devices(cubeb * /* context */, cubeb_device_type type,
+ cubeb_device_collection * collection)
+ {
+ vector<AudioObjectID> input_devs;
+ vector<AudioObjectID> output_devs;
+
+ // Count number of input and output devices. This is not
+@@ -3478,29 +3566,35 @@ audiounit_enumerate_devices(cubeb * /* c
+
+ static void
+ audiounit_device_destroy(cubeb_device_info * device)
+ {
+ delete[] device->device_id;
+ delete[] device->friendly_name;
+ delete[] device->vendor_name;
+ }
++#endif
+
+ static int
+ audiounit_device_collection_destroy(cubeb * /* context */,
+ cubeb_device_collection * collection)
+ {
++ #if TARGET_OS_IPHONE
++ return CUBEB_ERROR_NOT_SUPPORTED;
++ #else
+ for (size_t i = 0; i < collection->count; i++) {
+ audiounit_device_destroy(&collection->device[i]);
+ }
+ delete[] collection->device;
+
+ return CUBEB_OK;
++ #endif
+ }
+
++#if !TARGET_OS_IPHONE
+ static vector<AudioObjectID>
+ audiounit_get_devices_of_type(cubeb_device_type devtype)
+ {
+ UInt32 size = 0;
+ OSStatus ret = AudioObjectGetPropertyDataSize(
+ kAudioObjectSystemObject, &DEVICES_PROPERTY_ADDRESS, 0, NULL, &size);
+ if (ret != noErr) {
+ return vector<AudioObjectID>();
+@@ -3653,17 +3747,28 @@ audiounit_remove_device_listener(cubeb *
+ context->output_collection_changed_callback) {
+ return noErr;
+ }
+ /* Note: unregister a non registered cb is not a problem, not checking. */
+ return AudioObjectRemovePropertyListener(
+ kAudioObjectSystemObject, &DEVICES_PROPERTY_ADDRESS,
+ audiounit_collection_changed_callback, context);
+ }
+-
++#endif
++
++#if TARGET_OS_IPHONE
++int
++audiounit_register_device_collection_changed(
++ cubeb * context, cubeb_device_type devtype,
++ cubeb_device_collection_changed_callback collection_changed_callback,
++ void * user_ptr)
++{
++ return CUBEB_ERROR_NOT_SUPPORTED;
++}
++#else
+ int
+ audiounit_register_device_collection_changed(
+ cubeb * context, cubeb_device_type devtype,
+ cubeb_device_collection_changed_callback collection_changed_callback,
+ void * user_ptr)
+ {
+ if (devtype == CUBEB_DEVICE_TYPE_UNKNOWN) {
+ return CUBEB_ERROR_INVALID_PARAMETER;
+@@ -3673,16 +3778,17 @@ audiounit_register_device_collection_cha
+ if (collection_changed_callback) {
+ ret = audiounit_add_device_listener(context, devtype,
+ collection_changed_callback, user_ptr);
+ } else {
+ ret = audiounit_remove_device_listener(context, devtype);
+ }
+ return (ret == noErr) ? CUBEB_OK : CUBEB_ERROR;
+ }
++#endif
+
+ cubeb_ops const audiounit_ops = {
+ /*.init =*/audiounit_init,
+ /*.get_backend_id =*/audiounit_get_backend_id,
+ /*.get_max_channel_count =*/audiounit_get_max_channel_count,
+ /*.get_min_latency =*/audiounit_get_min_latency,
+ /*.get_preferred_sample_rate =*/audiounit_get_preferred_sample_rate,
+ /*.get_supported_input_processing_params =*/NULL,
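
The net effect of the patch above is one pattern applied throughout
cubeb_audiounit.cpp: every AudioObject/HAL call is compiled only under
!TARGET_OS_IPHONE, and iOS builds take fixed fallbacks instead (stereo
layout, a 44100 Hz preferred rate, pass-through latency clamping, and
stubbed device enumeration). A minimal, self-contained C++ sketch of that
gating pattern follows; query_default_device_rate() is a hypothetical
stand-in for the AudioObjectGetPropertyData() query the real code performs.

    #include <cstdint>

    #ifndef TARGET_OS_IPHONE
    #define TARGET_OS_IPHONE 0 // supplied by <TargetConditionals.h> on Apple toolchains
    #endif

    // Hypothetical stand-in for reading kAudioDevicePropertyNominalSampleRate
    // from the default output device on macOS.
    static uint32_t
    query_default_device_rate()
    {
      return 48000;
    }

    static int
    preferred_rate_sketch(uint32_t * rate)
    {
    #if TARGET_OS_IPHONE
      *rate = 44100; // no HAL on iOS; the patch assumes the common default
    #else
      *rate = query_default_device_rate();
    #endif
      return 0; // CUBEB_OK
    }
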
diff --git a/media/libcubeb/0005-aaudio-timing-fix.patch b/media/libcubeb/0005-aaudio-timing-fix.patch
new file mode 100644
index 0000000000..aabaec9c50
--- /dev/null
+++ b/media/libcubeb/0005-aaudio-timing-fix.patch
@@ -0,0 +1,57 @@
+From 19fcbefe1a9c5e22f8111af251df27b41658bc77 Mon Sep 17 00:00:00 2001
+From: John Lin <jolin@mozilla.com>
+Date: Mon, 29 Apr 2024 13:46:57 -0700
+Subject: [PATCH] Invalidate timing info buffers when destroying AAudio stream.
+
+aaudio_stream_get_position() returns an incorrect result because
+aaudio_stream_init() recycles a destroyed stream whose
+timing_info buffers still contain stale data.
+---
+ src/cubeb_aaudio.cpp | 2 ++
+ src/cubeb_triple_buffer.h | 7 +++++++
+ test/test_triple_buffer.cpp | 3 +++
+ 3 files changed, 12 insertions(+)
+
+diff --git a/src/cubeb_aaudio.cpp b/src/cubeb_aaudio.cpp
+index cfae2d6f..8b5eb231 100644
+--- a/src/cubeb_aaudio.cpp
++++ b/src/cubeb_aaudio.cpp
+@@ -1049,6 +1049,8 @@ aaudio_stream_destroy_locked(cubeb_stream * stm, lock_guard<mutex> & lock)
+ stm->istream = nullptr;
+ }
+
++ stm->timing_info.invalidate();
++
+ if (stm->resampler) {
+ cubeb_resampler_destroy(stm->resampler);
+ stm->resampler = nullptr;
+diff --git a/src/cubeb_triple_buffer.h b/src/cubeb_triple_buffer.h
+index a5a5978f..759b92e6 100644
+--- a/src/cubeb_triple_buffer.h
++++ b/src/cubeb_triple_buffer.h
+@@ -42,6 +42,13 @@ template <typename T> class triple_buffer {
+ {
+ return (shared_state.load(std::memory_order_relaxed) & BACK_DIRTY_BIT) != 0;
+ }
++ // Reset state and indices to initial values.
++ void invalidate()
++ {
++ shared_state.store(0, std::memory_order_release);
++ input_idx = 1;
++ output_idx = 2;
++ }
+
+ private:
+ // Publish a value to the consumer. Returns true if the data was overwritten
+diff --git a/test/test_triple_buffer.cpp b/test/test_triple_buffer.cpp
+index a6e0049b..d463c07e 100644
+--- a/test/test_triple_buffer.cpp
++++ b/test/test_triple_buffer.cpp
+@@ -64,4 +64,7 @@ TEST(cubeb, triple_buffer)
+ }
+
+ t.join();
++
++ buffer.invalidate();
++ ASSERT_FALSE(buffer.updated());
+ }
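
For context, the triple_buffer<T>::invalidate() added above returns the
buffer to its construction-time state: the release store of 0 clears the
BACK_DIRTY_BIT so updated() reports false, and input_idx/output_idx go back
to their initial slots. A usage sketch, assuming triple_buffer's existing
write() producer API from cubeb_triple_buffer.h alongside the updated() and
invalidate() methods shown in the patch:

    #include "cubeb_triple_buffer.h"
    #include <cassert>
    #include <cstdint>

    struct timing_info_sketch {
      int64_t position;
    };

    int
    main()
    {
      triple_buffer<timing_info_sketch> buf;
      timing_info_sketch t{42};
      buf.write(t);           // producer publishes a value
      assert(buf.updated());  // consumer side now sees pending data
      buf.invalidate();       // stream destroyed: drop it before recycling
      assert(!buf.updated()); // a recycled stream starts with no stale data
      return 0;
    }
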
diff --git a/media/libcubeb/moz.yaml b/media/libcubeb/moz.yaml
index d79e64b5eb..3444bdb1d6 100644
--- a/media/libcubeb/moz.yaml
+++ b/media/libcubeb/moz.yaml
@@ -20,6 +20,8 @@ vendoring:
- 0001-disable-aaudio-before-android-31.patch
- 0002-disable-crash-reporter-death-test.patch
- 0003-Only-build-duplex_collection_change_no_unregister-wh.patch
+ - 0004-audiounit-ios-compile-fixes.patch
+ - 0005-aaudio-timing-fix.patch
skip-vendoring-steps:
- update-moz-build
exclude:
diff --git a/media/libcubeb/src/cubeb_aaudio.cpp b/media/libcubeb/src/cubeb_aaudio.cpp
index df19602cd6..c2441bbeef 100644
--- a/media/libcubeb/src/cubeb_aaudio.cpp
+++ b/media/libcubeb/src/cubeb_aaudio.cpp
@@ -1039,6 +1039,8 @@ aaudio_stream_destroy_locked(cubeb_stream * stm, lock_guard<mutex> & lock)
stm->istream = nullptr;
}
+ stm->timing_info.invalidate();
+
if (stm->resampler) {
cubeb_resampler_destroy(stm->resampler);
stm->resampler = nullptr;
diff --git a/media/libcubeb/src/cubeb_audiounit.cpp b/media/libcubeb/src/cubeb_audiounit.cpp
index d823e80ff8..fb15790159 100644
--- a/media/libcubeb/src/cubeb_audiounit.cpp
+++ b/media/libcubeb/src/cubeb_audiounit.cpp
@@ -41,6 +41,15 @@ using namespace std;
typedef UInt32 AudioFormatFlags;
#endif
+#if TARGET_OS_IPHONE
+typedef UInt32 AudioDeviceID;
+typedef UInt32 AudioObjectID;
+const UInt32 kAudioObjectUnknown = 0;
+
+#define AudioGetCurrentHostTime mach_absolute_time
+
+#endif
+
#define AU_OUT_BUS 0
#define AU_IN_BUS 1
@@ -65,6 +74,7 @@ const char * PRIVATE_AGGREGATE_DEVICE_NAME = "CubebAggregateDevice";
LOG(msg, ##__VA_ARGS__); \
})
+#if !TARGET_OS_IPHONE
/* Testing empirically, some headsets report a minimal latency that is very
* low, but this does not work in practice. Lie and say the minimum is 256
* frames. */
@@ -73,27 +83,28 @@ const uint32_t SAFE_MAX_LATENCY_FRAMES = 512;
const AudioObjectPropertyAddress DEFAULT_INPUT_DEVICE_PROPERTY_ADDRESS = {
kAudioHardwarePropertyDefaultInputDevice, kAudioObjectPropertyScopeGlobal,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
const AudioObjectPropertyAddress DEFAULT_OUTPUT_DEVICE_PROPERTY_ADDRESS = {
kAudioHardwarePropertyDefaultOutputDevice, kAudioObjectPropertyScopeGlobal,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
const AudioObjectPropertyAddress DEVICE_IS_ALIVE_PROPERTY_ADDRESS = {
kAudioDevicePropertyDeviceIsAlive, kAudioObjectPropertyScopeGlobal,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
const AudioObjectPropertyAddress DEVICES_PROPERTY_ADDRESS = {
kAudioHardwarePropertyDevices, kAudioObjectPropertyScopeGlobal,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
const AudioObjectPropertyAddress INPUT_DATA_SOURCE_PROPERTY_ADDRESS = {
kAudioDevicePropertyDataSource, kAudioDevicePropertyScopeInput,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
const AudioObjectPropertyAddress OUTPUT_DATA_SOURCE_PROPERTY_ADDRESS = {
kAudioDevicePropertyDataSource, kAudioDevicePropertyScopeOutput,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
+#endif
typedef uint32_t device_flags_value;
@@ -114,22 +125,22 @@ static void
audiounit_close_stream(cubeb_stream * stm);
static int
audiounit_setup_stream(cubeb_stream * stm);
+#if !TARGET_OS_IPHONE
static vector<AudioObjectID>
audiounit_get_devices_of_type(cubeb_device_type devtype);
static UInt32
audiounit_get_device_presentation_latency(AudioObjectID devid,
AudioObjectPropertyScope scope);
-
-#if !TARGET_OS_IPHONE
static AudioObjectID
audiounit_get_default_device_id(cubeb_device_type type);
static int
audiounit_uninstall_device_changed_callback(cubeb_stream * stm);
static int
audiounit_uninstall_system_changed_callback(cubeb_stream * stm);
+#endif
+
static void
audiounit_reinit_stream_async(cubeb_stream * stm, device_flags_value flags);
-#endif
extern cubeb_ops const audiounit_ops;
@@ -144,9 +155,11 @@ struct cubeb {
cubeb_device_collection_changed_callback output_collection_changed_callback =
nullptr;
void * output_collection_changed_user_ptr = nullptr;
+ #if !TARGET_OS_IPHONE
// Store list of devices to detect changes
vector<AudioObjectID> input_device_array;
vector<AudioObjectID> output_device_array;
+ #endif
// The queue should be released when it’s no longer needed.
dispatch_queue_t serial_queue =
dispatch_queue_create(DISPATCH_QUEUE_LABEL, DISPATCH_QUEUE_SERIAL);
@@ -186,6 +199,7 @@ struct device_info {
device_flags_value flags = DEV_UNKNOWN;
};
+#if !TARGET_OS_IPHONE
struct property_listener {
AudioDeviceID device_id;
const AudioObjectPropertyAddress * property_address;
@@ -199,6 +213,7 @@ struct property_listener {
{
}
};
+#endif
struct cubeb_stream {
explicit cubeb_stream(cubeb * context);
@@ -257,22 +272,26 @@ struct cubeb_stream {
/* This is true if a device change callback is currently running. */
atomic<bool> switching_device{false};
atomic<bool> buffer_size_change_state{false};
+ #if !TARGET_OS_IPHONE
AudioDeviceID aggregate_device_id =
kAudioObjectUnknown; // the aggregate device id
AudioObjectID plugin_id =
kAudioObjectUnknown; // used to create aggregate device
+ #endif
/* Mixer interface */
unique_ptr<cubeb_mixer, decltype(&cubeb_mixer_destroy)> mixer;
/* Buffer where remixing/resampling will occur when upmixing is required */
/* Only accessed from callback thread */
unique_ptr<uint8_t[]> temp_buffer;
size_t temp_buffer_size = 0; // size in bytes.
+ #if !TARGET_OS_IPHONE
/* Listeners indicating what system events are monitored. */
unique_ptr<property_listener> default_input_listener;
unique_ptr<property_listener> default_output_listener;
unique_ptr<property_listener> input_alive_listener;
unique_ptr<property_listener> input_source_listener;
unique_ptr<property_listener> output_source_listener;
+ #endif
};
bool
@@ -386,14 +405,6 @@ is_common_sample_rate(Float64 sample_rate)
sample_rate == 88200 || sample_rate == 96000;
}
-#if TARGET_OS_IPHONE
-typedef UInt32 AudioDeviceID;
-typedef UInt32 AudioObjectID;
-
-#define AudioGetCurrentHostTime mach_absolute_time
-
-#endif
-
uint64_t
ConvertHostTimeToNanos(uint64_t host_time)
{
@@ -761,13 +772,13 @@ audiounit_get_backend_id(cubeb * /* ctx */)
return "audiounit";
}
-#if !TARGET_OS_IPHONE
static int
audiounit_stream_get_volume(cubeb_stream * stm, float * volume);
static int
audiounit_stream_set_volume(cubeb_stream * stm, float volume);
+#if !TARGET_OS_IPHONE
static int
audiounit_set_device_info(cubeb_stream * stm, AudioDeviceID id, io_side side)
{
@@ -811,6 +822,7 @@ audiounit_set_device_info(cubeb_stream * stm, AudioDeviceID id, io_side side)
return CUBEB_OK;
}
+#endif
static int
audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags)
@@ -822,10 +834,13 @@ audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags)
audiounit_stream_stop_internal(stm);
}
- int r = audiounit_uninstall_device_changed_callback(stm);
+ int r;
+#if !TARGET_OS_IPHONE
+ r = audiounit_uninstall_device_changed_callback(stm);
if (r != CUBEB_OK) {
LOG("(%p) Could not uninstall all device change listeners.", stm);
}
+#endif
{
auto_lock lock(stm->mutex);
@@ -837,6 +852,7 @@ audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags)
audiounit_close_stream(stm);
+ #if !TARGET_OS_IPHONE
/* Reinit occurs in one of the following case:
* - When the device is not alive any more
* - When the default system device change.
@@ -866,8 +882,11 @@ audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags)
return CUBEB_ERROR;
}
+ #endif
+
if (audiounit_setup_stream(stm) != CUBEB_OK) {
LOG("(%p) Stream reinit failed.", stm);
+ #if !TARGET_OS_IPHONE
if (flags & DEV_INPUT && input_device != kAudioObjectUnknown) {
// Attempt to re-use the same device-id failed, so attempt again with
// default input device.
@@ -879,6 +898,7 @@ audiounit_reinit_stream(cubeb_stream * stm, device_flags_value flags)
return CUBEB_ERROR;
}
}
+ #endif
}
if (vol_rv == CUBEB_OK) {
@@ -914,9 +934,11 @@ audiounit_reinit_stream_async(cubeb_stream * stm, device_flags_value flags)
}
if (audiounit_reinit_stream(stm, flags) != CUBEB_OK) {
+ #if !TARGET_OS_IPHONE
if (audiounit_uninstall_system_changed_callback(stm) != CUBEB_OK) {
LOG("(%p) Could not uninstall system changed callback", stm);
}
+ #endif
stm->state_callback(stm, stm->user_ptr, CUBEB_STATE_ERROR);
LOG("(%p) Could not reopen the stream after switching.", stm);
}
@@ -925,6 +947,7 @@ audiounit_reinit_stream_async(cubeb_stream * stm, device_flags_value flags)
});
}
+#if !TARGET_OS_IPHONE
static char const *
event_addr_to_string(AudioObjectPropertySelector selector)
{
@@ -1096,6 +1119,7 @@ audiounit_install_device_changed_callback(cubeb_stream * stm)
return r;
}
+#if !TARGET_OS_IPHONE
static int
audiounit_install_system_changed_callback(cubeb_stream * stm)
{
@@ -1136,6 +1160,7 @@ audiounit_install_system_changed_callback(cubeb_stream * stm)
return CUBEB_OK;
}
+#endif
static int
audiounit_uninstall_device_changed_callback(cubeb_stream * stm)
@@ -1212,7 +1237,7 @@ audiounit_get_acceptable_latency_range(AudioValueRange * latency_range)
AudioDeviceID output_device_id;
AudioObjectPropertyAddress output_device_buffer_size_range = {
kAudioDevicePropertyBufferFrameSizeRange, kAudioDevicePropertyScopeOutput,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
output_device_id = audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT);
if (output_device_id == kAudioObjectUnknown) {
@@ -1233,7 +1258,6 @@ audiounit_get_acceptable_latency_range(AudioValueRange * latency_range)
return CUBEB_OK;
}
-#endif /* !TARGET_OS_IPHONE */
static AudioObjectID
audiounit_get_default_device_id(cubeb_device_type type)
@@ -1256,6 +1280,7 @@ audiounit_get_default_device_id(cubeb_device_type type)
return devid;
}
+#endif /* !TARGET_OS_IPHONE */
int
audiounit_get_max_channel_count(cubeb * ctx, uint32_t * max_channels)
@@ -1270,7 +1295,7 @@ audiounit_get_max_channel_count(cubeb * ctx, uint32_t * max_channels)
AudioStreamBasicDescription stream_format;
AudioObjectPropertyAddress stream_format_address = {
kAudioDevicePropertyStreamFormat, kAudioDevicePropertyScopeOutput,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
assert(ctx && max_channels);
@@ -1309,17 +1334,16 @@ audiounit_get_min_latency(cubeb * /* ctx */, cubeb_stream_params /* params */,
*latency_frames =
max<uint32_t>(latency_range.mMinimum, SAFE_MIN_LATENCY_FRAMES);
-#endif
-
return CUBEB_OK;
+#endif
}
static int
audiounit_get_preferred_sample_rate(cubeb * /* ctx */, uint32_t * rate)
{
#if TARGET_OS_IPHONE
- // TODO
- return CUBEB_ERROR_NOT_SUPPORTED;
+ *rate = 44100;
+ return CUBEB_OK;
#else
UInt32 size;
OSStatus r;
@@ -1327,7 +1351,7 @@ audiounit_get_preferred_sample_rate(cubeb * /* ctx */, uint32_t * rate)
AudioDeviceID output_device_id;
AudioObjectPropertyAddress samplerate_address = {
kAudioDevicePropertyNominalSampleRate, kAudioObjectPropertyScopeGlobal,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
output_device_id = audiounit_get_default_device_id(CUBEB_DEVICE_TYPE_OUTPUT);
if (output_device_id == kAudioObjectUnknown) {
@@ -1343,8 +1367,9 @@ audiounit_get_preferred_sample_rate(cubeb * /* ctx */, uint32_t * rate)
}
*rate = static_cast<uint32_t>(fsamplerate);
-#endif
+
return CUBEB_OK;
+#endif
}
static cubeb_channel_layout
@@ -1385,6 +1410,9 @@ audiounit_convert_channel_layout(AudioChannelLayout * layout)
static cubeb_channel_layout
audiounit_get_preferred_channel_layout(AudioUnit output_unit)
{
+ #if TARGET_OS_IPHONE
+ return CUBEB_LAYOUT_STEREO;
+ #else
OSStatus rv = noErr;
UInt32 size = 0;
rv = AudioUnitGetPropertyInfo(
@@ -1409,6 +1437,7 @@ audiounit_get_preferred_channel_layout(AudioUnit output_unit)
}
return audiounit_convert_channel_layout(layout.get());
+ #endif
}
static cubeb_channel_layout
@@ -1442,8 +1471,10 @@ audiounit_get_current_channel_layout(AudioUnit output_unit)
static int
audiounit_create_unit(AudioUnit * unit, device_info * device);
+#if !TARGET_OS_IPHONE
static OSStatus
audiounit_remove_device_listener(cubeb * context, cubeb_device_type devtype);
+#endif
static void
audiounit_destroy(cubeb * ctx)
@@ -1465,6 +1496,7 @@ audiounit_destroy(cubeb * ctx)
!ctx->output_collection_changed_callback &&
!ctx->output_collection_changed_user_ptr);
+ #if !TARGET_OS_IPHONE
/* Unregister the callback if necessary. */
if (ctx->input_collection_changed_callback) {
audiounit_remove_device_listener(ctx, CUBEB_DEVICE_TYPE_INPUT);
@@ -1472,6 +1504,7 @@ audiounit_destroy(cubeb * ctx)
if (ctx->output_collection_changed_callback) {
audiounit_remove_device_listener(ctx, CUBEB_DEVICE_TYPE_OUTPUT);
}
+ #endif
}
dispatch_release(ctx->serial_queue);
@@ -1599,13 +1632,14 @@ audiounit_layout_init(cubeb_stream * stm, io_side side)
stm->context->layout);
}
+#if !TARGET_OS_IPHONE
static vector<AudioObjectID>
audiounit_get_sub_devices(AudioDeviceID device_id)
{
vector<AudioDeviceID> sub_devices;
AudioObjectPropertyAddress property_address = {
kAudioAggregateDevicePropertyActiveSubDeviceList,
- kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain};
UInt32 size = 0;
OSStatus rv = AudioObjectGetPropertyDataSize(device_id, &property_address, 0,
nullptr, &size);
@@ -1634,7 +1668,7 @@ audiounit_create_blank_aggregate_device(AudioObjectID * plugin_id,
{
AudioObjectPropertyAddress address_plugin_bundle_id = {
kAudioHardwarePropertyPlugInForBundleID, kAudioObjectPropertyScopeGlobal,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
UInt32 size = 0;
OSStatus r = AudioObjectGetPropertyDataSize(
kAudioObjectSystemObject, &address_plugin_bundle_id, 0, NULL, &size);
@@ -1664,7 +1698,7 @@ audiounit_create_blank_aggregate_device(AudioObjectID * plugin_id,
AudioObjectPropertyAddress create_aggregate_device_address = {
kAudioPlugInCreateAggregateDevice, kAudioObjectPropertyScopeGlobal,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
r = AudioObjectGetPropertyDataSize(
*plugin_id, &create_aggregate_device_address, 0, nullptr, &size);
if (r != noErr) {
@@ -1736,7 +1770,7 @@ get_device_name(AudioDeviceID id)
CFStringRef UIname = nullptr;
AudioObjectPropertyAddress address_uuid = {kAudioDevicePropertyDeviceUID,
kAudioObjectPropertyScopeGlobal,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
OSStatus err =
AudioObjectGetPropertyData(id, &address_uuid, 0, nullptr, &size, &UIname);
return (err == noErr) ? UIname : NULL;
@@ -1779,7 +1813,7 @@ audiounit_set_aggregate_sub_device_list(AudioDeviceID aggregate_device_id,
AudioObjectPropertyAddress aggregate_sub_device_list = {
kAudioAggregateDevicePropertyFullSubDeviceList,
- kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain};
UInt32 size = sizeof(CFMutableArrayRef);
OSStatus rv = AudioObjectSetPropertyData(
aggregate_device_id, &aggregate_sub_device_list, 0, nullptr, size,
@@ -1801,7 +1835,7 @@ audiounit_set_master_aggregate_device(const AudioDeviceID aggregate_device_id)
assert(aggregate_device_id != kAudioObjectUnknown);
AudioObjectPropertyAddress master_aggregate_sub_device = {
kAudioAggregateDevicePropertyMasterSubDevice,
- kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyScopeGlobal, kAudioObjectPropertyElementMain};
// Master becomes the 1st output sub device
AudioDeviceID output_device_id =
@@ -1834,7 +1868,7 @@ audiounit_activate_clock_drift_compensation(
assert(aggregate_device_id != kAudioObjectUnknown);
AudioObjectPropertyAddress address_owned = {
kAudioObjectPropertyOwnedObjects, kAudioObjectPropertyScopeGlobal,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
UInt32 qualifier_data_size = sizeof(AudioObjectID);
AudioClassID class_id = kAudioSubDeviceClassID;
@@ -1866,7 +1900,7 @@ audiounit_activate_clock_drift_compensation(
AudioObjectPropertyAddress address_drift = {
kAudioSubDevicePropertyDriftCompensation, kAudioObjectPropertyScopeGlobal,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
// Start from the second device since the first is the master clock
for (UInt32 i = 1; i < subdevices_num; ++i) {
@@ -1935,7 +1969,7 @@ audiounit_workaround_for_airpod(cubeb_stream * stm)
Float64 rate = input_nominal_rate;
AudioObjectPropertyAddress addr = {kAudioDevicePropertyNominalSampleRate,
kAudioObjectPropertyScopeGlobal,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
OSStatus rv = AudioObjectSetPropertyData(stm->aggregate_device_id, &addr, 0,
nullptr, sizeof(Float64), &rate);
@@ -2019,7 +2053,7 @@ audiounit_destroy_aggregate_device(AudioObjectID plugin_id,
plugin_id != kAudioObjectUnknown);
AudioObjectPropertyAddress destroy_aggregate_device_addr = {
kAudioPlugInDestroyAggregateDevice, kAudioObjectPropertyScopeGlobal,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
UInt32 size;
OSStatus rv = AudioObjectGetPropertyDataSize(
plugin_id, &destroy_aggregate_device_addr, 0, NULL, &size);
@@ -2042,6 +2076,7 @@ audiounit_destroy_aggregate_device(AudioObjectID plugin_id,
*aggregate_device_id = kAudioObjectUnknown;
return CUBEB_OK;
}
+#endif
static int
audiounit_new_unit_instance(AudioUnit * unit, device_info * device)
@@ -2178,6 +2213,9 @@ audiounit_init_input_linear_buffer(cubeb_stream * stream, uint32_t capacity)
static uint32_t
audiounit_clamp_latency(cubeb_stream * stm, uint32_t latency_frames)
{
+ #if TARGET_OS_IPHONE
+ return latency_frames;
+ #else
// For the 1st stream set anything within safe min-max
assert(audiounit_active_streams(stm->context) > 0);
if (audiounit_active_streams(stm->context) == 1) {
@@ -2238,8 +2276,10 @@ audiounit_clamp_latency(cubeb_stream * stm, uint32_t latency_frames)
return max(min<uint32_t>(latency_frames, upper_latency_limit),
SAFE_MIN_LATENCY_FRAMES);
+ #endif
}
+#if !TARGET_OS_IPHONE
/*
* Changing the buffer size is prone to deadlock, thus we change it
* following these steps:
@@ -2290,11 +2330,15 @@ buffer_size_changed_callback(void * inClientData, AudioUnit inUnit,
}
}
}
+#endif
static int
audiounit_set_buffer_size(cubeb_stream * stm, uint32_t new_size_frames,
io_side side)
{
+ #if TARGET_OS_IPHONE
+ return CUBEB_OK;
+ #else
AudioUnit au = stm->output_unit;
AudioUnitScope au_scope = kAudioUnitScope_Input;
AudioUnitElement au_element = AU_OUT_BUS;
@@ -2382,6 +2426,7 @@ audiounit_set_buffer_size(cubeb_stream * stm, uint32_t new_size_frames,
LOG("(%p) %s buffer size changed to %u frames.", stm, to_string(side),
new_size_frames);
return CUBEB_OK;
+ #endif
}
static int
@@ -2598,6 +2643,7 @@ audiounit_setup_stream(cubeb_stream * stm)
device_info in_dev_info = stm->input_device;
device_info out_dev_info = stm->output_device;
+ #if !TARGET_OS_IPHONE
if (has_input(stm) && has_output(stm) &&
stm->input_device.id != stm->output_device.id) {
r = audiounit_create_aggregate_device(stm);
@@ -2615,6 +2661,10 @@ audiounit_setup_stream(cubeb_stream * stm)
out_dev_info.flags = DEV_OUTPUT;
}
}
+ #else
+ in_dev_info.flags = DEV_SYSTEM_DEFAULT | DEV_INPUT;
+ out_dev_info.flags = DEV_SYSTEM_DEFAULT | DEV_OUTPUT;
+ #endif
if (has_input(stm)) {
r = audiounit_create_unit(&stm->input_unit, &in_dev_info);
@@ -2756,8 +2806,10 @@ audiounit_setup_stream(cubeb_stream * stm)
return CUBEB_ERROR;
}
+ #if !TARGET_OS_IPHONE
stm->current_latency_frames = audiounit_get_device_presentation_latency(
stm->output_device.id, kAudioDevicePropertyScopeOutput);
+ #endif
Float64 unit_s;
UInt32 size = sizeof(unit_s);
@@ -2777,10 +2829,12 @@ audiounit_setup_stream(cubeb_stream * stm)
ceilf(stm->output_hw_rate / stm->input_hw_rate);
}
+ #if !TARGET_OS_IPHONE
r = audiounit_install_device_changed_callback(stm);
if (r != CUBEB_OK) {
LOG("(%p) Could not install all device change callback.", stm);
}
+ #endif
return CUBEB_OK;
}
@@ -2828,21 +2882,25 @@ audiounit_stream_init(cubeb * context, cubeb_stream ** stream,
}
if (input_stream_params) {
stm->input_stream_params = *input_stream_params;
+ #if !TARGET_OS_IPHONE
r = audiounit_set_device_info(
stm.get(), reinterpret_cast<uintptr_t>(input_device), io_side::INPUT);
if (r != CUBEB_OK) {
LOG("(%p) Fail to set device info for input.", stm.get());
return r;
}
+ #endif
}
if (output_stream_params) {
stm->output_stream_params = *output_stream_params;
+ #if !TARGET_OS_IPHONE
r = audiounit_set_device_info(
stm.get(), reinterpret_cast<uintptr_t>(output_device), io_side::OUTPUT);
if (r != CUBEB_OK) {
LOG("(%p) Fail to set device info for output.", stm.get());
return r;
}
+ #endif
}
{
@@ -2858,11 +2916,13 @@ audiounit_stream_init(cubeb * context, cubeb_stream ** stream,
return r;
}
+ #if !TARGET_OS_IPHONE
r = audiounit_install_system_changed_callback(stm.get());
if (r != CUBEB_OK) {
LOG("(%p) Could not install the device change callback.", stm.get());
return r;
}
+ #endif
*stream = stm.release();
LOG("(%p) Cubeb stream init successful.", *stream);
@@ -2891,11 +2951,13 @@ audiounit_close_stream(cubeb_stream * stm)
stm->resampler.reset();
stm->mixer.reset();
+ #if !TARGET_OS_IPHONE
if (stm->aggregate_device_id != kAudioObjectUnknown) {
audiounit_destroy_aggregate_device(stm->plugin_id,
&stm->aggregate_device_id);
stm->aggregate_device_id = kAudioObjectUnknown;
}
+ #endif
}
static void
@@ -2903,6 +2965,7 @@ audiounit_stream_destroy_internal(cubeb_stream * stm)
{
stm->context->mutex.assert_current_thread_owns();
+#if !TARGET_OS_IPHONE
int r = audiounit_uninstall_system_changed_callback(stm);
if (r != CUBEB_OK) {
LOG("(%p) Could not uninstall the device changed callback", stm);
@@ -2911,6 +2974,7 @@ audiounit_stream_destroy_internal(cubeb_stream * stm)
if (r != CUBEB_OK) {
LOG("(%p) Could not uninstall all device change listeners", stm);
}
+#endif
auto_lock lock(stm->mutex);
audiounit_close_stream(stm);
@@ -2921,6 +2985,7 @@ audiounit_stream_destroy_internal(cubeb_stream * stm)
static void
audiounit_stream_destroy(cubeb_stream * stm)
{
+ #if !TARGET_OS_IPHONE
int r = audiounit_uninstall_system_changed_callback(stm);
if (r != CUBEB_OK) {
LOG("(%p) Could not uninstall the device changed callback", stm);
@@ -2929,6 +2994,7 @@ audiounit_stream_destroy(cubeb_stream * stm)
if (r != CUBEB_OK) {
LOG("(%p) Could not uninstall all device change listeners", stm);
}
+ #endif
if (!stm->shutdown.load()) {
auto_lock context_lock(stm->context->mutex);
@@ -3086,6 +3152,7 @@ convert_uint32_into_string(UInt32 data)
return str;
}
+#if !TARGET_OS_IPHONE
int
audiounit_get_default_device_datasource(cubeb_device_type type, UInt32 * data)
{
@@ -3107,12 +3174,16 @@ audiounit_get_default_device_datasource(cubeb_device_type type, UInt32 * data)
return CUBEB_OK;
}
+#endif
int
audiounit_get_default_device_name(cubeb_stream * stm,
cubeb_device * const device,
cubeb_device_type type)
{
+#if TARGET_OS_IPHONE
+ return CUBEB_ERROR_NOT_SUPPORTED;
+#else
assert(stm);
assert(device);
@@ -3129,6 +3200,7 @@ audiounit_get_default_device_name(cubeb_stream * stm,
type == CUBEB_DEVICE_TYPE_INPUT ? "input" : "output");
}
return CUBEB_OK;
+ #endif
}
int
@@ -3183,6 +3255,7 @@ audiounit_stream_register_device_changed_callback(
return CUBEB_OK;
}
+#if !TARGET_OS_IPHONE
static char *
audiounit_strref_to_cstr_utf8(CFStringRef strref)
{
@@ -3204,12 +3277,14 @@ audiounit_strref_to_cstr_utf8(CFStringRef strref)
return ret;
}
+#endif
+#if !TARGET_OS_IPHONE
static uint32_t
audiounit_get_channel_count(AudioObjectID devid, AudioObjectPropertyScope scope)
{
AudioObjectPropertyAddress adr = {0, scope,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
UInt32 size = 0;
uint32_t i, ret = 0;
@@ -3235,7 +3310,7 @@ audiounit_get_available_samplerate(AudioObjectID devid,
uint32_t * def)
{
AudioObjectPropertyAddress adr = {0, scope,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
adr.mSelector = kAudioDevicePropertyNominalSampleRate;
if (AudioObjectHasProperty(devid, &adr)) {
@@ -3277,7 +3352,7 @@ audiounit_get_device_presentation_latency(AudioObjectID devid,
AudioObjectPropertyScope scope)
{
AudioObjectPropertyAddress adr = {0, scope,
- kAudioObjectPropertyElementMaster};
+ kAudioObjectPropertyElementMain};
UInt32 size, dev, stream = 0;
AudioStreamID sid[1];
@@ -3302,7 +3377,7 @@ static int
audiounit_create_device_from_hwdev(cubeb_device_info * dev_info,
AudioObjectID devid, cubeb_device_type type)
{
- AudioObjectPropertyAddress adr = {0, 0, kAudioObjectPropertyElementMaster};
+ AudioObjectPropertyAddress adr = {0, 0, kAudioObjectPropertyElementMain};
UInt32 size;
if (type == CUBEB_DEVICE_TYPE_OUTPUT) {
@@ -3313,7 +3388,11 @@ audiounit_create_device_from_hwdev(cubeb_device_info * dev_info,
return CUBEB_ERROR;
}
+ #if TARGET_OS_IPHONE
+ UInt32 ch = 2;
+ #else
UInt32 ch = audiounit_get_channel_count(devid, adr.mScope);
+ #endif
if (ch == 0) {
return CUBEB_ERROR;
}
@@ -3417,7 +3496,16 @@ is_aggregate_device(cubeb_device_info * device_info)
return !strncmp(device_info->friendly_name, PRIVATE_AGGREGATE_DEVICE_NAME,
strlen(PRIVATE_AGGREGATE_DEVICE_NAME));
}
+#endif
+#if TARGET_OS_IPHONE
+static int
+audiounit_enumerate_devices(cubeb * /* context */, cubeb_device_type type,
+ cubeb_device_collection * collection)
+{
+ return CUBEB_ERROR_NOT_SUPPORTED;
+}
+#else
static int
audiounit_enumerate_devices(cubeb * /* context */, cubeb_device_type type,
cubeb_device_collection * collection)
@@ -3483,19 +3571,25 @@ audiounit_device_destroy(cubeb_device_info * device)
delete[] device->friendly_name;
delete[] device->vendor_name;
}
+#endif
static int
audiounit_device_collection_destroy(cubeb * /* context */,
cubeb_device_collection * collection)
{
+ #if TARGET_OS_IPHONE
+ return CUBEB_ERROR_NOT_SUPPORTED;
+ #else
for (size_t i = 0; i < collection->count; i++) {
audiounit_device_destroy(&collection->device[i]);
}
delete[] collection->device;
return CUBEB_OK;
+ #endif
}
+#if !TARGET_OS_IPHONE
static vector<AudioObjectID>
audiounit_get_devices_of_type(cubeb_device_type devtype)
{
@@ -3658,7 +3752,18 @@ audiounit_remove_device_listener(cubeb * context, cubeb_device_type devtype)
kAudioObjectSystemObject, &DEVICES_PROPERTY_ADDRESS,
audiounit_collection_changed_callback, context);
}
+#endif
+#if TARGET_OS_IPHONE
+int
+audiounit_register_device_collection_changed(
+ cubeb * context, cubeb_device_type devtype,
+ cubeb_device_collection_changed_callback collection_changed_callback,
+ void * user_ptr)
+{
+ return CUBEB_ERROR_NOT_SUPPORTED;
+}
+#else
int
audiounit_register_device_collection_changed(
cubeb * context, cubeb_device_type devtype,
@@ -3678,6 +3783,7 @@ audiounit_register_device_collection_changed(
}
return (ret == noErr) ? CUBEB_OK : CUBEB_ERROR;
}
+#endif
cubeb_ops const audiounit_ops = {
/*.init =*/audiounit_init,
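Editor's note: the iOS changes above all follow one pattern — CoreAudio's AudioObject/HAL device-management APIs do not exist on iOS, so each call site is either compiled out with #if !TARGET_OS_IPHONE, replaced by a stub returning CUBEB_ERROR_NOT_SUPPORTED, or given an early return with a fixed value. A minimal sketch of the early-return variant (the function name and clamp bounds are hypothetical; the real code uses SAFE_MIN_LATENCY_FRAMES and the device's reported buffer-size range):

    #include <algorithm>
    #include <cstdint>

    static uint32_t
    clamp_latency_sketch(uint32_t latency_frames)
    {
    #if TARGET_OS_IPHONE
      // iOS: no HAL buffer-size property to consult; trust the caller.
      return latency_frames;
    #else
      // macOS: clamp against a device-derived range (bounds illustrative).
      return std::max(std::min(latency_frames, 4096u), 128u);
    #endif
    }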
diff --git a/media/libcubeb/src/cubeb_triple_buffer.h b/media/libcubeb/src/cubeb_triple_buffer.h
index a5a5978fb4..759b92e62b 100644
--- a/media/libcubeb/src/cubeb_triple_buffer.h
+++ b/media/libcubeb/src/cubeb_triple_buffer.h
@@ -42,6 +42,13 @@ public:
{
return (shared_state.load(std::memory_order_relaxed) & BACK_DIRTY_BIT) != 0;
}
+ // Reset state and indices to initial values.
+ void invalidate()
+ {
+ shared_state.store(0, std::memory_order_release);
+ input_idx = 1;
+ output_idx = 2;
+ }
private:
// Publish a value to the consumer. Returns true if the data was overwritten
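Editor's note: invalidate() gives the owner a way to discard a pending value without consuming it, e.g. when a stream is torn down and reacquired. A usage sketch, assuming cubeb_triple_buffer.h is on the include path; only invalidate() and updated() are shown by the diff, so treat the exact write() signature as an assumption:

    #include "cubeb_triple_buffer.h"
    #include <cassert>

    void reset_between_sessions_sketch()
    {
      triple_buffer<int> buf;
      int v = 42;
      buf.write(v);            // producer publishes a value (assumed API)
      assert(buf.updated());   // consumer would now see fresh data
      buf.invalidate();        // drop it: state and indices back to initial
      assert(!buf.updated());  // matches the new assertion in the test below
    }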
diff --git a/media/libcubeb/src/moz.build b/media/libcubeb/src/moz.build
index d7d05b5867..46a89c4063 100644
--- a/media/libcubeb/src/moz.build
+++ b/media/libcubeb/src/moz.build
@@ -74,8 +74,8 @@ if CONFIG['MOZ_AUDIOUNIT_RUST']:
SOURCES += [
'cubeb_osx_run_loop.c',
]
+ DEFINES['USE_AUDIOUNIT_RUST'] = True
DEFINES['USE_AUDIOUNIT'] = True
- DEFINES['USE_AUDIOUNIT_RUST'] = True
if CONFIG['MOZ_WASAPI']:
SOURCES += [
diff --git a/media/libcubeb/test/test_triple_buffer.cpp b/media/libcubeb/test/test_triple_buffer.cpp
index a6e0049b79..d463c07e03 100644
--- a/media/libcubeb/test/test_triple_buffer.cpp
+++ b/media/libcubeb/test/test_triple_buffer.cpp
@@ -64,4 +64,7 @@ TEST(cubeb, triple_buffer)
}
t.join();
+
+ buffer.invalidate();
+ ASSERT_FALSE(buffer.updated());
}

diff --git a/media/libdav1d/config.h b/media/libdav1d/config.h
index 218c8ae7f4..c7bdc7defc 100644
--- a/media/libdav1d/config.h
+++ b/media/libdav1d/config.h
@@ -46,7 +46,10 @@
// Those values are copied from the auto generated
// config file produced by stand alone dav1d build.
# define HAVE_AS_FUNC 0
+// Use the <sys/auxv.h> getauxval() path only on Linux systems.
+#if defined(__linux__)
# define HAVE_GETAUXVAL 1
+#endif
# define PIC 3
#endif
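Editor's note: HAVE_GETAUXVAL gates getauxval() calls in dav1d's CPU detection; leaving it defined on non-Linux targets breaks the build because <sys/auxv.h>'s getauxval() is a glibc/Linux interface. A hedged sketch of the kind of code this guards (not dav1d's actual implementation):

    #if HAVE_GETAUXVAL
    #include <sys/auxv.h>
    #endif

    static unsigned cpu_flags_sketch(void) {
    #if HAVE_GETAUXVAL
        unsigned long hw = getauxval(AT_HWCAP);  // Linux-only API
        return (hw & HWCAP_NEON) ? 1u : 0u;      // HWCAP_NEON: 32-bit Arm bit
    #else
        return 0;  // no runtime detection available; assume baseline features
    #endif
    }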
diff --git a/media/libdav1d/moz.yaml b/media/libdav1d/moz.yaml
index 22994fc7bf..ca526ea688 100644
--- a/media/libdav1d/moz.yaml
+++ b/media/libdav1d/moz.yaml
@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
- release: 4796b59fc0a459588183dc2ea199ba1074befc67 (2024-02-18T15:37:04.000+01:00).
+ release: 8e08426468a76d8a667e8a79d92bafd85d7411ac (2024-03-18T20:50:37.000+00:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
- revision: 4796b59fc0a459588183dc2ea199ba1074befc67
+ revision: 8e08426468a76d8a667e8a79d92bafd85d7411ac
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/
diff --git a/media/libdav1d/vcs_version.h b/media/libdav1d/vcs_version.h
index 1ac3f3ded3..af1770d5bd 100644
--- a/media/libdav1d/vcs_version.h
+++ b/media/libdav1d/vcs_version.h
@@ -1,2 +1,2 @@
/* auto-generated, do not edit */
-#define DAV1D_VERSION "4796b59fc0a459588183dc2ea199ba1074befc67"
+#define DAV1D_VERSION "8e08426468a76d8a667e8a79d92bafd85d7411ac"
diff --git a/media/libjxl/moz.yaml b/media/libjxl/moz.yaml
index 7b8d187ff4..ddf34a3dc9 100644
--- a/media/libjxl/moz.yaml
+++ b/media/libjxl/moz.yaml
@@ -10,9 +10,9 @@ origin:
url: https://github.com/libjxl/libjxl
- release: f06a34c77b1bd11bafbe82989241e68c756ccca2 (2024-03-11T15:14:53Z).
+ release: a5e4aa1fc1fe5bee252225a2616dccde7fd35da0 (2024-04-01T20:09:39Z).
- revision: f06a34c77b1bd11bafbe82989241e68c756ccca2
+ revision: a5e4aa1fc1fe5bee252225a2616dccde7fd35da0
license: Apache-2.0
diff --git a/media/libopus/celt/arm/armcpu.c b/media/libopus/celt/arm/armcpu.c
index 06a53435b8..6785121ac9 100644
--- a/media/libopus/celt/arm/armcpu.c
+++ b/media/libopus/celt/arm/armcpu.c
@@ -96,7 +96,7 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){
/* Linux based */
#include <stdio.h>
-opus_uint32 opus_cpu_capabilities(void)
+static opus_uint32 opus_cpu_capabilities(void)
{
opus_uint32 flags = 0;
FILE *cpuinfo;
@@ -169,7 +169,7 @@ opus_uint32 opus_cpu_capabilities(void)
#include <sys/types.h>
#include <sys/sysctl.h>
-opus_uint32 opus_cpu_capabilities(void)
+static opus_uint32 opus_cpu_capabilities(void)
{
opus_uint32 flags = 0;
@@ -191,6 +191,54 @@ opus_uint32 opus_cpu_capabilities(void)
return flags;
}
+#elif defined(__FreeBSD__)
+#include <sys/auxv.h>
+
+static opus_uint32 opus_cpu_capabilities(void)
+{
+ long hwcap = 0;
+ opus_uint32 flags = 0;
+
+# if defined(OPUS_ARM_MAY_HAVE_MEDIA) \
+ || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+ /* FreeBSD requires armv6+, which always supports media instructions */
+ flags |= OPUS_CPU_ARM_MEDIA_FLAG;
+# endif
+
+ elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);
+
+# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_MEDIA) \
+ || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+# ifdef HWCAP_EDSP
+ if (hwcap & HWCAP_EDSP)
+ flags |= OPUS_CPU_ARM_EDSP_FLAG;
+# endif
+
+# if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+# ifdef HWCAP_NEON
+ if (hwcap & HWCAP_NEON)
+ flags |= OPUS_CPU_ARM_NEON_FLAG;
+# elif defined(HWCAP_ASIMD)
+ if (hwcap & HWCAP_ASIMD)
+ flags |= OPUS_CPU_ARM_NEON_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_EDSP_FLAG;
+# endif
+# endif
+# if defined(OPUS_ARM_MAY_HAVE_DOTPROD) && defined(HWCAP_ASIMDDP)
+ if (hwcap & HWCAP_ASIMDDP)
+ flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+# endif
+# endif
+
+#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR)
+ flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+ flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+# endif
+#endif
+
+ return (flags);
+}
+
#else
/* The feature registers which can tell us what the processor supports are
* accessible in privileged modes only, so we can't have a general user-space
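Editor's note on the FreeBSD path added above: FreeBSD has no getauxval(); its equivalent is elf_aux_info(), which fills a caller-provided buffer and returns an error code instead of a value. A small portability shim along these lines is the usual way to bridge the two (sketch, not part of the patch):

    #include <sys/auxv.h>

    static unsigned long hwcap_sketch(void) {
    #if defined(__linux__)
        return getauxval(AT_HWCAP);           // returns 0 if unavailable
    #elif defined(__FreeBSD__)
        unsigned long hw = 0;
        // elf_aux_info() returns 0 on success; hw stays 0 on error.
        elf_aux_info(AT_HWCAP, &hw, sizeof hw);
        return hw;
    #else
        return 0;
    #endif
    }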
diff --git a/media/libopus/celt/x86/x86cpu.h b/media/libopus/celt/x86/x86cpu.h
index 8ae9be8d8f..1e5b6a4cb3 100644
--- a/media/libopus/celt/x86/x86cpu.h
+++ b/media/libopus/celt/x86/x86cpu.h
@@ -68,8 +68,22 @@ int opus_select_arch(void);
Use this to work around those restrictions (which should hopefully all get
optimized to a single MOVD instruction).
GCC implemented _mm_loadu_si32() since GCC 11; HOWEVER, there is a bug!
- https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754 */
-# if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !(defined(__clang__) && (__clang_major__ >= 8))
+ https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754
+ LLVM implemented _mm_loadu_si32() since Clang 8.0, however the
+ __clang_major__ version number macro is unreliable, as vendors
+ (specifically, Apple) will use different numbering schemes than upstream.
+ Clang's advice is "use feature detection", but they do not provide feature
+ detection support for specific SIMD functions.
+ We follow the approach from the SIMDe project and instead detect unrelated
+ features that should be available in the version we want (see
+ <https://github.com/simd-everywhere/simde/blob/master/simde/simde-detect-clang.h>).*/
+# if defined(__clang__)
+# if __has_warning("-Wextra-semi-stmt") || \
+ __has_builtin(__builtin_rotateleft32)
+# define OPUS_CLANG_8 (1)
+# endif
+# endif
+# if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !defined(OPUS_CLANG_8)
# include <string.h>
# include <emmintrin.h>
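Editor's note: the probe added above sidesteps __clang_major__ entirely — Apple clang reports its own version numbers, so the code asks about artifacts that shipped with upstream Clang 8 instead. The same trick generalizes; for instance, SIMDe detects Clang 9 via the __FILE_NAME__ extension introduced in that release (macro name below is hypothetical):

    #if defined(__clang__)
    # if defined(__FILE_NAME__)          /* __FILE_NAME__ appeared in Clang 9 */
    #  define SKETCH_CLANG_AT_LEAST_9 1  /* illustrative macro name */
    # endif
    #endif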
diff --git a/media/libopus/moz.build b/media/libopus/moz.build
index 44c0ab7c90..c5b2021ba7 100644
--- a/media/libopus/moz.build
+++ b/media/libopus/moz.build
@@ -21,7 +21,7 @@ FINAL_LIBRARY = "gkcodecs"
NoVisibilityFlags()
DEFINES["OPUS_BUILD"] = True
-DEFINES["OPUS_VERSION"] = "ab4e83598e7fc8b2ce82dc633a0fc0c452b629aa"
+DEFINES["OPUS_VERSION"] = "fdb198e88660721e289df94c29e91f70caff787e"
DEFINES["USE_ALLOCA"] = True
DEFINES["ENABLE_HARDENING"] = True
diff --git a/media/libopus/moz.yaml b/media/libopus/moz.yaml
index ed76d36d1f..7728a66c41 100644
--- a/media/libopus/moz.yaml
+++ b/media/libopus/moz.yaml
@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
- release: ab4e83598e7fc8b2ce82dc633a0fc0c452b629aa (2024-03-04T11:53:07.000-05:00).
+ release: fdb198e88660721e289df94c29e91f70caff787e (2024-04-09T14:29:12.000-04:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
- revision: ab4e83598e7fc8b2ce82dc633a0fc0c452b629aa
+ revision: fdb198e88660721e289df94c29e91f70caff787e
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/
diff --git a/media/libopus/silk/x86/NSQ_del_dec_avx2.c b/media/libopus/silk/x86/NSQ_del_dec_avx2.c
index 43485871a4..21f00c2dad 100644
--- a/media/libopus/silk/x86/NSQ_del_dec_avx2.c
+++ b/media/libopus/silk/x86/NSQ_del_dec_avx2.c
@@ -73,7 +73,6 @@ static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC)
/* Intrinsics not defined on MSVC */
#ifdef _MSC_VER
#include <Intsafe.h>
-#define __m128i_u __m128i
static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res)
{
*res = a+b;
@@ -959,7 +958,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
{
__m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i]));
x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16);
- _mm_storeu_si128((__m128i_u*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x));
+ _mm_storeu_si128((__m128i*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x));
}
/* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
@@ -985,8 +984,8 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
/* Scale long-term shaping state */
for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i+=4)
{
- __m128i_u* p = (__m128i_u*)&NSQ->sLTP_shp_Q14[i];
- *p = silk_mm_smulww_epi32(*p, gain_adj_Q16);
+ opus_int32 *p = &NSQ->sLTP_shp_Q14[i];
+ _mm_storeu_si128((__m128i*)p, silk_mm_smulww_epi32(_mm_loadu_si128((__m128i*)p), gain_adj_Q16));
}
/* Scale long-term prediction state */
@@ -1041,13 +1040,13 @@ static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
/* Allowing wrap around so that two wraps can cancel each other. The rare
cases where the result wraps around can only be triggered by invalid streams*/
- __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-8]));
- __m256i B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)& B[0]));
+ __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&in_ptr[-8]));
+ __m256i B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)& B[0]));
__m256i sum = _mm256_mullo_epi32(in_v, silk_mm256_reverse_epi32(B_v));
if (order > 10)
{
- in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&in_ptr[-16]));
- B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i_u*)&B [8]));
+ in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&in_ptr[-16]));
+ B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&B [8]));
B_v = silk_mm256_reverse_epi32(B_v);
}
else
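Editor's note: the NSQ_del_dec_avx2.c edits above all serve one goal. __m128i_u is GCC's unaligned-vector typedef, a GNU extension MSVC never defines, and the old #define __m128i_u __m128i shim silently dropped the unaligned qualifier. Going through explicit _mm_loadu_si128/_mm_storeu_si128 expresses the same unaligned access portably. Sketch of the idiom (the add is a stand-in for silk_mm_smulww_epi32):

    #include <emmintrin.h>

    void scale_sketch(int *p /* possibly unaligned */, __m128i gain) {
        // Portable: let the intrinsics encode the unaligned load/store.
        __m128i v = _mm_loadu_si128((const __m128i *)p);
        v = _mm_add_epi32(v, gain);
        _mm_storeu_si128((__m128i *)p, v);
        // Non-portable alternative (GCC/Clang only): *(__m128i_u *)p = v;
    }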
diff --git a/media/libopus/src/opus_private.h b/media/libopus/src/opus_private.h
index 364c21cebc..279f5f95f6 100644
--- a/media/libopus/src/opus_private.h
+++ b/media/libopus/src/opus_private.h
@@ -214,7 +214,7 @@ int opus_multistream_decode_native(
opus_int32 opus_packet_extensions_parse(const unsigned char *data, opus_int32 len, opus_extension_data *extensions, opus_int32 *nb_extensions);
-opus_int32 opus_packet_extensions_generate(unsigned char *data, opus_int32 len, const opus_extension_data *extensions, int nb_extensions, int pad);
+opus_int32 opus_packet_extensions_generate(unsigned char *data, opus_int32 len, const opus_extension_data *extensions, opus_int32 nb_extensions, int pad);
opus_int32 opus_packet_extensions_count(const unsigned char *data, opus_int32 len);
diff --git a/media/libopus/src/repacketizer.c b/media/libopus/src/repacketizer.c
index 6a7a8b3d8e..79798b0217 100644
--- a/media/libopus/src/repacketizer.c
+++ b/media/libopus/src/repacketizer.c
@@ -155,7 +155,8 @@ opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int
/* incorporate any extensions from the repacketizer padding */
for (i=begin;i<end;i++)
{
- int frame_ext_count, j;
+ int j;
+ opus_int32 frame_ext_count;
frame_ext_count = total_ext_count - ext_count;
int ret = opus_packet_extensions_parse(rp->paddings[i], rp->padding_len[i],
&all_extensions[ext_count], &frame_ext_count);
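Editor's note: the local count is widened together with the prototype because opus_packet_extensions_parse() writes the result through an opus_int32 *; the pointer types must match exactly even on targets where opus_int32 happens to be a plain int. The constraint, in sketch form:

    opus_int32 count = total_ext_count - ext_count;   // not: int count
    // parse() takes opus_int32 *nb_extensions; &count only matches if
    // count really has that type.
    int ret = opus_packet_extensions_parse(padding, padding_len,
                                           &all_extensions[ext_count], &count);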
diff --git a/media/libvpx/arm_cpu_runtime_detection_code_on_openbsd.patch b/media/libvpx/arm_cpu_runtime_detection_code_on_openbsd.patch
new file mode 100644
index 0000000000..4788b3996a
--- /dev/null
+++ b/media/libvpx/arm_cpu_runtime_detection_code_on_openbsd.patch
@@ -0,0 +1,41 @@
+# HG changeset patch
+# User Chun-Min Chang <chun.m.chang@gmail.com>
+
+Bug 1888772 - Allow ARM CPU runtime detection code to build on OpenBSD
+
+diff --git a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c
+--- a/vpx_ports/aarch64_cpudetect.c
++++ b/vpx_ports/aarch64_cpudetect.c
+@@ -10,30 +10,30 @@
+
+ #include "./vpx_config.h"
+ #include "arm_cpudetect.h"
+
+ #if defined(__APPLE__)
+ #include <sys/sysctl.h>
+ #endif
+
+-#if !CONFIG_RUNTIME_CPU_DETECT
++#if !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__)
+
+ static int arm_get_cpu_caps(void) {
+ // This function should actually be a no-op. There is no way to adjust any of
+ // these because the RTCD tables do not exist: the functions are called
+ // statically.
+ int flags = 0;
+ #if HAVE_NEON
+ flags |= HAS_NEON;
+ #endif // HAVE_NEON
+ return flags;
+ }
+
+-#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT
++#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__)
+
+ // sysctlbyname() parameter documentation for instruction set characteristics:
+ // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics
+ static INLINE int64_t have_feature(const char *feature) {
+ int64_t feature_present = 0;
+ size_t size = sizeof(feature_present);
+ if (sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) {
+ return 0;
diff --git a/media/libvpx/config/generic/vpx_config.asm b/media/libvpx/config/generic/vpx_config.asm
index 47243ad198..7a1aaf999a 100644
--- a/media/libvpx/config/generic/vpx_config.asm
+++ b/media/libvpx/config/generic/vpx_config.asm
@@ -13,6 +13,7 @@
.equ HAVE_NEON_DOTPROD , 0
.equ HAVE_NEON_I8MM , 0
.equ HAVE_SVE , 0
+.equ HAVE_SVE2 , 0
.equ HAVE_MIPS32 , 0
.equ HAVE_DSPR2 , 0
.equ HAVE_MSA , 0
diff --git a/media/libvpx/config/generic/vpx_config.c b/media/libvpx/config/generic/vpx_config.c
index d1c3d1acd7..922edd1ea2 100644
--- a/media/libvpx/config/generic/vpx_config.c
+++ b/media/libvpx/config/generic/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=generic-gnu --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512";
+static const char* const cfg = "--target=generic-gnu --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --log=/home/cm/Work/gecko-dev/media/libvpx/config/generic/config.log";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/generic/vpx_config.h b/media/libvpx/config/generic/vpx_config.h
index 774a531ed9..c885bb399a 100644
--- a/media/libvpx/config/generic/vpx_config.h
+++ b/media/libvpx/config/generic/vpx_config.h
@@ -22,6 +22,7 @@
#define HAVE_NEON_DOTPROD 0
#define HAVE_NEON_I8MM 0
#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_MIPS32 0
#define HAVE_DSPR2 0
#define HAVE_MSA 0
diff --git a/media/libvpx/config/linux/arm/vpx_config.asm b/media/libvpx/config/linux/arm/vpx_config.asm
index ee43d0f922..6be2a7f7a2 100644
--- a/media/libvpx/config/linux/arm/vpx_config.asm
+++ b/media/libvpx/config/linux/arm/vpx_config.asm
@@ -13,6 +13,7 @@
.equ HAVE_NEON_DOTPROD , 0
.equ HAVE_NEON_I8MM , 0
.equ HAVE_SVE , 0
+.equ HAVE_SVE2 , 0
.equ HAVE_MIPS32 , 0
.equ HAVE_DSPR2 , 0
.equ HAVE_MSA , 0
diff --git a/media/libvpx/config/linux/arm/vpx_config.c b/media/libvpx/config/linux/arm/vpx_config.c
index c885d910c0..c634e2af66 100644
--- a/media/libvpx/config/linux/arm/vpx_config.c
+++ b/media/libvpx/config/linux/arm/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-runtime-cpu-detect --enable-realtime-only";
+static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-runtime-cpu-detect --enable-realtime-only --log=/home/cm/Work/gecko-dev/media/libvpx/config/linux/arm/config.log";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/linux/arm/vpx_config.h b/media/libvpx/config/linux/arm/vpx_config.h
index bfd2c04e07..99a55f0ea9 100644
--- a/media/libvpx/config/linux/arm/vpx_config.h
+++ b/media/libvpx/config/linux/arm/vpx_config.h
@@ -22,6 +22,7 @@
#define HAVE_NEON_DOTPROD 0
#define HAVE_NEON_I8MM 0
#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_MIPS32 0
#define HAVE_DSPR2 0
#define HAVE_MSA 0
diff --git a/media/libvpx/config/linux/arm64/vp9_rtcd.h b/media/libvpx/config/linux/arm64/vp9_rtcd.h
index 738de4f9f4..b7d828d446 100644
--- a/media/libvpx/config/linux/arm64/vp9_rtcd.h
+++ b/media/libvpx/config/linux/arm64/vp9_rtcd.h
@@ -35,11 +35,13 @@ extern "C" {
int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_neon
+int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
-#define vp9_block_error_fp vp9_block_error_fp_neon
+int64_t vp9_block_error_fp_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
@@ -96,6 +98,10 @@ static void setup_rtcd_internal(void)
(void)flags;
+ vp9_block_error = vp9_block_error_neon;
+ if (flags & HAS_SVE) vp9_block_error = vp9_block_error_sve;
+ vp9_block_error_fp = vp9_block_error_fp_neon;
+ if (flags & HAS_SVE) vp9_block_error_fp = vp9_block_error_fp_sve;
}
#endif
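Editor's note: this rtcd change swaps a compile-time #define for run-time dispatch — since SVE may or may not be present on a given AArch64 CPU, the dispatcher starts at the Neon/C baseline and upgrades when CPU detection reports SVE. The mechanism reduced to essentials (all names and the flag value are illustrative):

    #include <cstdint>

    using block_error_fn = int64_t (*)(const int16_t *, const int16_t *, intptr_t);

    static int64_t block_error_c(const int16_t *, const int16_t *, intptr_t)   { return 0; } // stub
    static int64_t block_error_sve(const int16_t *, const int16_t *, intptr_t) { return 0; } // stub

    static block_error_fn vp9_block_error_ptr = nullptr;  // the RTCD_EXTERN slot

    constexpr int HAS_SVE_SKETCH = 1 << 6;  // illustrative flag bit

    void setup_rtcd_sketch(int flags) {     // flags from the CPU-caps layer
        vp9_block_error_ptr = block_error_c;                       // safe baseline
        if (flags & HAS_SVE_SKETCH) vp9_block_error_ptr = block_error_sve;
    }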
diff --git a/media/libvpx/config/linux/arm64/vpx_config.asm b/media/libvpx/config/linux/arm64/vpx_config.asm
index 499c16202c..c51a76b3f6 100644
--- a/media/libvpx/config/linux/arm64/vpx_config.asm
+++ b/media/libvpx/config/linux/arm64/vpx_config.asm
@@ -13,6 +13,7 @@
.equ HAVE_NEON_DOTPROD , 1
.equ HAVE_NEON_I8MM , 1
.equ HAVE_SVE , 1
+.equ HAVE_SVE2 , 1
.equ HAVE_MIPS32 , 0
.equ HAVE_DSPR2 , 0
.equ HAVE_MSA , 0
diff --git a/media/libvpx/config/linux/arm64/vpx_config.c b/media/libvpx/config/linux/arm64/vpx_config.c
index 74baa0689c..c0d714503f 100644
--- a/media/libvpx/config/linux/arm64/vpx_config.c
+++ b/media/libvpx/config/linux/arm64/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=arm64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only";
+static const char* const cfg = "--target=arm64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only --log=/home/cm/Work/gecko-dev/media/libvpx/config/linux/arm64/config.log";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/linux/arm64/vpx_config.h b/media/libvpx/config/linux/arm64/vpx_config.h
index 3c5f2e33ca..12251ee0c1 100644
--- a/media/libvpx/config/linux/arm64/vpx_config.h
+++ b/media/libvpx/config/linux/arm64/vpx_config.h
@@ -22,6 +22,7 @@
#define HAVE_NEON_DOTPROD 1
#define HAVE_NEON_I8MM 1
#define HAVE_SVE 1
+#define HAVE_SVE2 1
#define HAVE_MIPS32 0
#define HAVE_DSPR2 0
#define HAVE_MSA 0
diff --git a/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h b/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h
index 5a9b05ca14..2c31ee4ef9 100644
--- a/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h
+++ b/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h
@@ -916,7 +916,8 @@ void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t di
uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size);
-#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_neon
+uint64_t vpx_sum_squares_2d_i16_sve(const int16_t *src, int stride, int size);
+RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size);
void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
@@ -1148,6 +1149,8 @@ static void setup_rtcd_internal(void)
if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon_dotprod;
vpx_sse = vpx_sse_neon;
if (flags & HAS_NEON_DOTPROD) vpx_sse = vpx_sse_neon_dotprod;
+ vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_neon;
+ if (flags & HAS_SVE) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sve;
vpx_variance16x16 = vpx_variance16x16_neon;
if (flags & HAS_NEON_DOTPROD) vpx_variance16x16 = vpx_variance16x16_neon_dotprod;
vpx_variance16x32 = vpx_variance16x32_neon;
diff --git a/media/libvpx/config/linux/ia32/vpx_config.asm b/media/libvpx/config/linux/ia32/vpx_config.asm
index eaa3950d37..5a92abf939 100644
--- a/media/libvpx/config/linux/ia32/vpx_config.asm
+++ b/media/libvpx/config/linux/ia32/vpx_config.asm
@@ -10,6 +10,7 @@
%define HAVE_NEON_DOTPROD 0
%define HAVE_NEON_I8MM 0
%define HAVE_SVE 0
+%define HAVE_SVE2 0
%define HAVE_MIPS32 0
%define HAVE_DSPR2 0
%define HAVE_MSA 0
diff --git a/media/libvpx/config/linux/ia32/vpx_config.c b/media/libvpx/config/linux/ia32/vpx_config.c
index 6805ab62a8..7024ca989f 100644
--- a/media/libvpx/config/linux/ia32/vpx_config.c
+++ b/media/libvpx/config/linux/ia32/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm";
+static const char* const cfg = "--target=x86-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/linux/ia32/config.log";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/linux/ia32/vpx_config.h b/media/libvpx/config/linux/ia32/vpx_config.h
index 69fd63bf02..b4cc10a906 100644
--- a/media/libvpx/config/linux/ia32/vpx_config.h
+++ b/media/libvpx/config/linux/ia32/vpx_config.h
@@ -22,6 +22,7 @@
#define HAVE_NEON_DOTPROD 0
#define HAVE_NEON_I8MM 0
#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_MIPS32 0
#define HAVE_DSPR2 0
#define HAVE_MSA 0
diff --git a/media/libvpx/config/linux/x64/vpx_config.asm b/media/libvpx/config/linux/x64/vpx_config.asm
index 8715768a2e..148a894979 100644
--- a/media/libvpx/config/linux/x64/vpx_config.asm
+++ b/media/libvpx/config/linux/x64/vpx_config.asm
@@ -10,6 +10,7 @@
%define HAVE_NEON_DOTPROD 0
%define HAVE_NEON_I8MM 0
%define HAVE_SVE 0
+%define HAVE_SVE2 0
%define HAVE_MIPS32 0
%define HAVE_DSPR2 0
%define HAVE_MSA 0
diff --git a/media/libvpx/config/linux/x64/vpx_config.c b/media/libvpx/config/linux/x64/vpx_config.c
index e4dcb394c3..f38bd16290 100644
--- a/media/libvpx/config/linux/x64/vpx_config.c
+++ b/media/libvpx/config/linux/x64/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86_64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm";
+static const char* const cfg = "--target=x86_64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/linux/x64/config.log";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/linux/x64/vpx_config.h b/media/libvpx/config/linux/x64/vpx_config.h
index ab4439aaf4..d91509ad10 100644
--- a/media/libvpx/config/linux/x64/vpx_config.h
+++ b/media/libvpx/config/linux/x64/vpx_config.h
@@ -22,6 +22,7 @@
#define HAVE_NEON_DOTPROD 0
#define HAVE_NEON_I8MM 0
#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_MIPS32 0
#define HAVE_DSPR2 0
#define HAVE_MSA 0
diff --git a/media/libvpx/config/mac/ia32/vpx_config.asm b/media/libvpx/config/mac/ia32/vpx_config.asm
index eaa3950d37..5a92abf939 100644
--- a/media/libvpx/config/mac/ia32/vpx_config.asm
+++ b/media/libvpx/config/mac/ia32/vpx_config.asm
@@ -10,6 +10,7 @@
%define HAVE_NEON_DOTPROD 0
%define HAVE_NEON_I8MM 0
%define HAVE_SVE 0
+%define HAVE_SVE2 0
%define HAVE_MIPS32 0
%define HAVE_DSPR2 0
%define HAVE_MSA 0
diff --git a/media/libvpx/config/mac/ia32/vpx_config.c b/media/libvpx/config/mac/ia32/vpx_config.c
index 3e5d3ec0f3..2ee9d0ebb0 100644
--- a/media/libvpx/config/mac/ia32/vpx_config.c
+++ b/media/libvpx/config/mac/ia32/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm";
+static const char* const cfg = "--target=x86-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/mac/ia32/config.log";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/mac/ia32/vpx_config.h b/media/libvpx/config/mac/ia32/vpx_config.h
index 69fd63bf02..b4cc10a906 100644
--- a/media/libvpx/config/mac/ia32/vpx_config.h
+++ b/media/libvpx/config/mac/ia32/vpx_config.h
@@ -22,6 +22,7 @@
#define HAVE_NEON_DOTPROD 0
#define HAVE_NEON_I8MM 0
#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_MIPS32 0
#define HAVE_DSPR2 0
#define HAVE_MSA 0
diff --git a/media/libvpx/config/mac/x64/vpx_config.asm b/media/libvpx/config/mac/x64/vpx_config.asm
index 8715768a2e..148a894979 100644
--- a/media/libvpx/config/mac/x64/vpx_config.asm
+++ b/media/libvpx/config/mac/x64/vpx_config.asm
@@ -10,6 +10,7 @@
%define HAVE_NEON_DOTPROD 0
%define HAVE_NEON_I8MM 0
%define HAVE_SVE 0
+%define HAVE_SVE2 0
%define HAVE_MIPS32 0
%define HAVE_DSPR2 0
%define HAVE_MSA 0
diff --git a/media/libvpx/config/mac/x64/vpx_config.c b/media/libvpx/config/mac/x64/vpx_config.c
index 9a06646fdc..51fceeb6e3 100644
--- a/media/libvpx/config/mac/x64/vpx_config.c
+++ b/media/libvpx/config/mac/x64/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86_64-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm";
+static const char* const cfg = "--target=x86_64-darwin9-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/mac/x64/config.log";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/mac/x64/vpx_config.h b/media/libvpx/config/mac/x64/vpx_config.h
index ab4439aaf4..d91509ad10 100644
--- a/media/libvpx/config/mac/x64/vpx_config.h
+++ b/media/libvpx/config/mac/x64/vpx_config.h
@@ -22,6 +22,7 @@
#define HAVE_NEON_DOTPROD 0
#define HAVE_NEON_I8MM 0
#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_MIPS32 0
#define HAVE_DSPR2 0
#define HAVE_MSA 0
diff --git a/media/libvpx/config/win/aarch64/vpx_config.asm b/media/libvpx/config/win/aarch64/vpx_config.asm
index 24eb1a8cba..32d700f1bb 100644
--- a/media/libvpx/config/win/aarch64/vpx_config.asm
+++ b/media/libvpx/config/win/aarch64/vpx_config.asm
@@ -12,7 +12,8 @@
.equ HAVE_NEON , 1
.equ HAVE_NEON_DOTPROD , 1
.equ HAVE_NEON_I8MM , 1
-.equ HAVE_SVE , 1
+.equ HAVE_SVE , 0
+.equ HAVE_SVE2 , 0
.equ HAVE_MIPS32 , 0
.equ HAVE_DSPR2 , 0
.equ HAVE_MSA , 0
diff --git a/media/libvpx/config/win/aarch64/vpx_config.c b/media/libvpx/config/win/aarch64/vpx_config.c
index 13cc13a95d..b8f4ec8754 100644
--- a/media/libvpx/config/win/aarch64/vpx_config.c
+++ b/media/libvpx/config/win/aarch64/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=arm64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only";
+static const char* const cfg = "--target=arm64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-realtime-only --disable-sve --log=/home/cm/Work/gecko-dev/media/libvpx/config/win/aarch64/config.log";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/win/aarch64/vpx_config.h b/media/libvpx/config/win/aarch64/vpx_config.h
index c3cc860f18..a81f868053 100644
--- a/media/libvpx/config/win/aarch64/vpx_config.h
+++ b/media/libvpx/config/win/aarch64/vpx_config.h
@@ -21,7 +21,8 @@
#define HAVE_NEON 1
#define HAVE_NEON_DOTPROD 1
#define HAVE_NEON_I8MM 1
-#define HAVE_SVE 1
+#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_MIPS32 0
#define HAVE_DSPR2 0
#define HAVE_MSA 0
diff --git a/media/libvpx/config/win/ia32/vpx_config.asm b/media/libvpx/config/win/ia32/vpx_config.asm
index cb1aa7ce6a..9c7e3ce2c2 100755
--- a/media/libvpx/config/win/ia32/vpx_config.asm
+++ b/media/libvpx/config/win/ia32/vpx_config.asm
@@ -10,6 +10,7 @@
%define HAVE_NEON_DOTPROD 0
%define HAVE_NEON_I8MM 0
%define HAVE_SVE 0
+%define HAVE_SVE2 0
%define HAVE_MIPS32 0
%define HAVE_DSPR2 0
%define HAVE_MSA 0
diff --git a/media/libvpx/config/win/ia32/vpx_config.c b/media/libvpx/config/win/ia32/vpx_config.c
index 33c836213b..8cdd6c30b2 100644
--- a/media/libvpx/config/win/ia32/vpx_config.c
+++ b/media/libvpx/config/win/ia32/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86-win32-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm";
+static const char* const cfg = "--target=x86-win32-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/win/ia32/config.log";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/win/ia32/vpx_config.h b/media/libvpx/config/win/ia32/vpx_config.h
index 9fe256f4ad..b62188c71c 100644
--- a/media/libvpx/config/win/ia32/vpx_config.h
+++ b/media/libvpx/config/win/ia32/vpx_config.h
@@ -22,6 +22,7 @@
#define HAVE_NEON_DOTPROD 0
#define HAVE_NEON_I8MM 0
#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_MIPS32 0
#define HAVE_DSPR2 0
#define HAVE_MSA 0
diff --git a/media/libvpx/config/win/x64/vpx_config.asm b/media/libvpx/config/win/x64/vpx_config.asm
index a1d34d6d37..d5f5f3968e 100644
--- a/media/libvpx/config/win/x64/vpx_config.asm
+++ b/media/libvpx/config/win/x64/vpx_config.asm
@@ -10,6 +10,7 @@
%define HAVE_NEON_DOTPROD 0
%define HAVE_NEON_I8MM 0
%define HAVE_SVE 0
+%define HAVE_SVE2 0
%define HAVE_MIPS32 0
%define HAVE_DSPR2 0
%define HAVE_MSA 0
diff --git a/media/libvpx/config/win/x64/vpx_config.c b/media/libvpx/config/win/x64/vpx_config.c
index 8c04c1a3cf..57904c7dc6 100644
--- a/media/libvpx/config/win/x64/vpx_config.c
+++ b/media/libvpx/config/win/x64/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86_64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm";
+static const char* const cfg = "--target=x86_64-win64-vs15 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --disable-avx512 --enable-postproc --enable-vp9-postproc --as=yasm --log=/home/cm/Work/gecko-dev/media/libvpx/config/win/x64/config.log";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/win/x64/vpx_config.h b/media/libvpx/config/win/x64/vpx_config.h
index 068c6d2a99..448f13e4a1 100644
--- a/media/libvpx/config/win/x64/vpx_config.h
+++ b/media/libvpx/config/win/x64/vpx_config.h
@@ -22,6 +22,7 @@
#define HAVE_NEON_DOTPROD 0
#define HAVE_NEON_I8MM 0
#define HAVE_SVE 0
+#define HAVE_SVE2 0
#define HAVE_MIPS32 0
#define HAVE_DSPR2 0
#define HAVE_MSA 0
diff --git a/media/libvpx/generate_sources_mozbuild.sh b/media/libvpx/generate_sources_mozbuild.sh
index ef9bc696f3..4efcb54aa1 100755
--- a/media/libvpx/generate_sources_mozbuild.sh
+++ b/media/libvpx/generate_sources_mozbuild.sh
@@ -169,7 +169,8 @@ function gen_rtcd_header {
# $1 - Header file directory.
# $2 - Config command line.
function gen_config_files {
- ./configure $2 > /dev/null
+ ./configure $2 --log=$BASE_DIR/$LIBVPX_CONFIG_DIR/$1/config.log > /dev/null
+ echo "Log file: $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/config.log"
# Disable HAVE_UNISTD_H.
( echo '/HAVE_UNISTD_H'; echo 'd' ; echo 'w' ; echo 'q' ) | ed -s vpx_config.h
@@ -203,6 +204,7 @@ all_platforms="${all_platforms} --disable-avx512"
x86_platforms="--enable-postproc --enable-vp9-postproc --as=yasm"
arm_platforms="--enable-runtime-cpu-detect --enable-realtime-only"
arm64_platforms="--enable-realtime-only"
+disable_sve="--disable-sve" # Bug 1885585
gen_config_files linux/x64 "--target=x86_64-linux-gcc ${all_platforms} ${x86_platforms}"
gen_config_files linux/ia32 "--target=x86-linux-gcc ${all_platforms} ${x86_platforms}"
@@ -213,7 +215,7 @@ gen_config_files win/ia32 "--target=x86-win32-gcc ${all_platforms} ${x86_platfor
gen_config_files linux/arm "--target=armv7-linux-gcc ${all_platforms} ${arm_platforms}"
gen_config_files linux/arm64 "--target=arm64-linux-gcc ${all_platforms} ${arm64_platforms}"
-gen_config_files win/aarch64 "--target=arm64-win64-vs15 ${all_platforms} ${arm64_platforms}"
+gen_config_files win/aarch64 "--target=arm64-win64-vs15 ${all_platforms} ${arm64_platforms} ${disable_sve}" # Bug 1885585
gen_config_files generic "--target=generic-gnu ${all_platforms}"
@@ -236,7 +238,7 @@ gen_rtcd_header win/ia32 x86
gen_rtcd_header linux/arm armv7
gen_rtcd_header linux/arm64 arm64
-gen_rtcd_header win/aarch64 arm64
+gen_rtcd_header win/aarch64 arm64 $disable_sve # Bug 1885585
gen_rtcd_header generic generic
@@ -275,6 +277,7 @@ config=$(print_config linux/arm64)
make_clean
make libvpx_srcs.txt target=libs $config > /dev/null
convert_srcs_to_project_files libvpx_srcs.txt ARM64
+# Bug 1885585: The sve files will be excluded from the win/aarch64 build in moz.build.
echo "Generate generic source list."
config=$(print_config generic)
diff --git a/media/libvpx/input_frame_validation.patch b/media/libvpx/input_frame_validation.patch
index 1cb33e192f..37f755e022 100644
--- a/media/libvpx/input_frame_validation.patch
+++ b/media/libvpx/input_frame_validation.patch
@@ -8,15 +8,15 @@ MozReview-Commit-ID: BxDCnJe0mzs
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
-@@ -921,20 +921,29 @@ static vpx_codec_err_t vp8e_encode(vpx_c
- dst_time_stamp =
- pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
- dst_end_time_stamp = (pts_val + (int64_t)duration) *
- ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
+@@ -989,20 +989,29 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
+ &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "conversion of relative pts + duration to ticks would overflow");
+ }
+ dst_end_time_stamp =
+ pts_end * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
- if (img != NULL) {
res = image2yuvconfig(img, &sd);
-
+
- if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, &sd,
- dst_time_stamp, dst_end_time_stamp)) {
- VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
diff --git a/media/libvpx/libvpx/.mailmap b/media/libvpx/libvpx/.mailmap
index bb0ddd95b2..7206b5ebec 100644
--- a/media/libvpx/libvpx/.mailmap
+++ b/media/libvpx/libvpx/.mailmap
@@ -20,6 +20,7 @@ Hui Su <huisu@google.com>
Jacky Chen <jackychen@google.com>
Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com>
+Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
Johann <johann@duck.com> <johann.koenig@gmail.com>
@@ -53,4 +54,4 @@ Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
Yaowu Xu <yaowu@google.com> <Yaowu Xu>
Venkatarama NG. Avadhani <venkatarama.avadhani@ittiam.com>
Vitaly Buka <vitalybuka@chromium.org> <vitlaybuka@chromium.org>
-xiwei gu <guxiwei-hf@loongson.cn>
+Xiwei Gu <guxiwei-hf@loongson.cn>
diff --git a/media/libvpx/libvpx/AUTHORS b/media/libvpx/libvpx/AUTHORS
index 2db4a113e4..5515e26589 100644
--- a/media/libvpx/libvpx/AUTHORS
+++ b/media/libvpx/libvpx/AUTHORS
@@ -25,6 +25,7 @@ Andrew Salkeld <andrew.salkeld@arm.com>
Angie Chen <yunqi@google.com>
Angie Chiang <angiebird@google.com>
Anton Venema <anton.venema@liveswitch.com>
+Anupam Pandey <anupam.pandey@ittiam.com>
Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
Birk Magnussen <birk.magnussen@googlemail.com>
@@ -34,6 +35,8 @@ Brion Vibber <bvibber@wikimedia.org>
changjun.yang <changjun.yang@intel.com>
Charles 'Buck' Krasic <ckrasic@google.com>
Cheng Chen <chengchen@google.com>
+Chen Wang <wangchen20@iscas.ac.cn>
+Cherma Rajan A <cherma.rajan@ittiam.com>
Chi Yo Tsai <chiyotsai@google.com>
chm <chm@rock-chips.com>
Chris Cunningham <chcunningham@chromium.org>
@@ -60,6 +63,8 @@ Fritz Koenig <frkoenig@google.com>
Fyodor Kyslov <kyslov@google.com>
Gabriel Marin <gmx@chromium.org>
Gaute Strokkenes <gaute.strokkenes@broadcom.com>
+George Steed <george.steed@arm.com>
+Gerda Zsejke More <gerdazsejke.more@arm.com>
Geza Lore <gezalore@gmail.com>
Ghislain MARY <ghislainmary2@gmail.com>
Giuseppe Scrivano <gscrivano@gnu.org>
@@ -103,6 +108,7 @@ Jin Bo <jinbo@loongson.cn>
Jingning Han <jingning@google.com>
Joel Fernandes <joelaf@google.com>
Joey Parrish <joeyparrish@google.com>
+Johann <johann@duck.com>
Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com>
Johnny Klonaris <google@jawknee.com>
@@ -120,6 +126,7 @@ KO Myung-Hun <komh@chollian.net>
Konstantinos Margaritis <konma@vectorcamp.gr>
Kyle Siefring <kylesiefring@gmail.com>
Lawrence Velázquez <larryv@macports.org>
+L. E. Segovia <amy@amyspark.me>
Linfeng Zhang <linfengz@google.com>
Liu Peng <pengliu.mail@gmail.com>
Lou Quillio <louquillio@google.com>
@@ -147,6 +154,7 @@ Mirko Bonadei <mbonadei@google.com>
Moriyoshi Koizumi <mozo@mozo.jp>
Morton Jonuschat <yabawock@gmail.com>
Nathan E. Egge <negge@mozilla.com>
+Neeraj Gadgil <neeraj.gadgil@ittiam.com>
Neil Birkbeck <neil.birkbeck@gmail.com>
Nico Weber <thakis@chromium.org>
Niveditha Rau <niveditha.rau@gmail.com>
@@ -213,7 +221,8 @@ Vitaly Buka <vitalybuka@chromium.org>
Vlad Tsyrklevich <vtsyrklevich@chromium.org>
Wan-Teh Chang <wtc@google.com>
Wonkap Jang <wonkap@google.com>
-xiwei gu <guxiwei-hf@loongson.cn>
+Xiahong Bao <xiahong.bao@nxp.com>
+Xiwei Gu <guxiwei-hf@loongson.cn>
Yaowu Xu <yaowu@google.com>
Yi Luo <luoyi@google.com>
Yongzhe Wang <yongzhe@google.com>
diff --git a/media/libvpx/libvpx/CHANGELOG b/media/libvpx/libvpx/CHANGELOG
index 21070785ed..87f0d7f708 100644
--- a/media/libvpx/libvpx/CHANGELOG
+++ b/media/libvpx/libvpx/CHANGELOG
@@ -1,7 +1,79 @@
-20yy-mm-dd v1.14.0 "V Duck"
+2024-01-02 v1.14.0 "Venetian Duck"
This release drops support for old C compilers, such as Visual Studio 2012
and older, that disallow mixing variable declarations and statements (a C99
- feature).
+ feature). It adds support for run-time CPU feature detection for Arm
+ platforms, as well as support for darwin23 (macOS 14).
+
+ - Upgrading:
+ This release is ABI incompatible with the previous release.
+
+ Various new features for rate control library for real-time: SVC parallel
+ encoding, loopfilter level, support for frame dropping, and screen content.
+
+ New callback function send_tpl_gop_stats for vp9 external rate control
+ library, which can be used to transmit TPL stats for a group of pictures. A
+ public header vpx_tpl.h is added for the definition of TPL stats used in
+ this callback.
+
+ libwebm is upgraded to libwebm-1.0.0.29-9-g1930e3c.
+
+ - Enhancement:
+ Improvements on Neon optimizations: VoD: 12-35% speed up for bitdepth 8,
+ 68%-151% speed up for high bitdepth.
+
+ Improvements on AVX2 and SSE optimizations.
+ Improvements on LSX optimizations for LoongArch.
+ 42-49% speedup on speed 0 VoD encoding.
+ Android API level predicates.
+
+ - Bug fixes:
+ Fix to missing prototypes from the rtcd header.
+ Fix to segfault when total size is enlarged but width is smaller.
+ Fix to the build for arm64ec using MSVC.
+ Fix to copy BLOCK_8X8's mi to PICK_MODE_CONTEXT::mic.
+ Fix to -Wshadow warnings.
+ Fix to heap overflow in vpx_get4x4sse_cs_neon.
+ Fix to buffer overrun in highbd Neon subpel variance filters.
+ Added bitexact encode test script.
+ Fix to -Wl,-z,defs with Clang's sanitizers.
+ Fix to decoder stability after error & continued decoding.
+ Fix to mismatch of VP9 encode with NEON intrinsics with C only version.
+ Fix to Arm64 MSVC compile vpx_highbd_fdct4x4_neon.
+ Fix to fragments count before use.
+ Fix to a case where target bandwidth is 0 for SVC.
+ Fix mask in vp9_quantize_avx2,highbd_get_max_lane_eob.
+ Fix to int overflow in vp9_calc_pframe_target_size_one_pass_cbr.
+ Fix to integer overflow in vp8,ratectrl.c.
+ Fix to integer overflow in vp9 svc.
+ Fix to avg_frame_bandwidth overflow.
+ Fix to per frame qp for temporal layers.
+ Fix to unsigned integer overflow in sse computation.
+ Fix to uninitialized mesh feature for BEST mode.
+ Fix to overflow in highbd temporal_filter.
+ Fix to unaligned loads w/w==4 in vpx_convolve_copy_neon.
+ Skip arm64_neon.h workaround w/VS >= 2019.
+ Fix to c vs avx mismatch of diamond_search_sad().
+ Fix to c vs intrinsic mismatch of vpx_hadamard_32x32() function.
+ Fix to a bug in vpx_hadamard_32x32_neon().
+ Fix to Clang -Wunreachable-code-aggressive warnings.
+ Fix to a bug in vpx_highbd_hadamard_32x32_neon().
+ Fix to -Wunreachable-code in mfqe_partition.
+ Force mode search on 64x64 if no mode is selected.
+ Fix to ubsan failure caused by left shift of negative.
+ Fix to integer overflow in calc_pframe_target_size.
+ Fix to float-cast-overflow in vp8_change_config().
+ Fix to a null ptr before use.
+ Conditionally skip using inter frames in speed features.
+ Remove invalid reference frames.
+ Disable intra mode search speed features conditionally.
+ Set nonrd keyframe under dynamic change of deadline for rtc.
+ Fix to scaled reference offsets.
+ Set skip_recode=0 in nonrd_pick_sb_modes.
+ Fix to an edge case when downsizing to one.
+ Fix to a bug in frame scaling.
+ Fix to pred buffer stride.
+ Fix to a bug in simple motion search.
+ Update frame size in actual encoding.
2023-09-29 v1.13.1 "Ugly Duckling"
This release contains two security related fixes. One each for VP8 and VP9.
diff --git a/media/libvpx/libvpx/README b/media/libvpx/libvpx/README
index 4c25b15d81..6dbd164c34 100644
--- a/media/libvpx/libvpx/README
+++ b/media/libvpx/libvpx/README
@@ -1,5 +1,3 @@
-v1.13.1 Ugly Duckling
-
Welcome to the WebM VP8/VP9 Codec SDK!
COMPILING THE APPLICATIONS/LIBRARIES:
@@ -183,6 +181,44 @@ CODE STYLE:
See also: http://clang.llvm.org/docs/ClangFormat.html
+PROFILE GUIDED OPTIMIZATION (PGO)
+ Profile Guided Optimization can be enabled for Clang builds using the
+ commands:
+
+ $ export CC=clang
+ $ export CXX=clang++
+ $ ../libvpx/configure --enable-profile
+ $ make
+
+  Generate one or more PGO profile files by running vpxdec or vpxenc. For
+ example:
+
+ $ ./vpxdec ../vpx/out_ful/vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm \
+ -o - > /dev/null
+
+ To convert and merge the raw profile files, use the llvm-profdata tool:
+
+ $ llvm-profdata merge -o perf.profdata default_8382761441159425451_0.profraw
+
+ Then, rebuild the project with the new profile file:
+
+ $ make clean
+ $ ../libvpx/configure --use-profile=perf.profdata
+ $ make
+
+ Note: Always use the llvm-profdata from the toolchain that is used for
+ compiling the PGO-enabled binary.
+
+  To observe the improvements from a PGO-enabled build, compare the missed
+  optimizations that the -Rpass-missed compiler flag reports with and without
+  the profile. For example, to list the missed loop vectorizations:
+
+ $ ../libvpx/configure --use-profile=perf.profdata \
+ --extra-cflags=-Rpass-missed=loop-vectorize
+
+  For guidance on using PGO files to identify potential optimization
+ opportunities, see: tools/README.pgo.md
+
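
As an optional sanity check between the merge and rebuild steps, the merged
profile can be inspected with llvm-profdata's show subcommand, for example
"llvm-profdata show perf.profdata", using the same llvm-profdata binary as
the merge step.
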
SUPPORT
This library is an open source project supported by its community. Please
email webm-discuss@webmproject.org for help.
diff --git a/media/libvpx/libvpx/build/make/Android.mk b/media/libvpx/libvpx/build/make/Android.mk
index ba24f541b1..533f43c1c2 100644
--- a/media/libvpx/libvpx/build/make/Android.mk
+++ b/media/libvpx/libvpx/build/make/Android.mk
@@ -15,13 +15,9 @@ ifdef NDK_ROOT
# In an Android project place a libvpx checkout in the jni directory.
# Run the configure script from the jni directory. Base libvpx
# encoder/decoder configuration will look similar to:
-# ./libvpx/configure --target=armv7-android-gcc --disable-examples \
+# ./libvpx/configure --target=arm64-android-gcc --disable-examples \
# --enable-external-build
#
-# When targeting Android, realtime-only is enabled by default. This can
-# be overridden by adding the command line flag:
-# --disable-realtime-only
-#
# This will create .mk files that contain variables that contain the
# source files to compile.
#
@@ -38,11 +34,14 @@ ifdef NDK_ROOT
# but the resulting library *must* be run on devices supporting all of the
# enabled extensions. They can be disabled individually with
# --disable-{sse2, sse3, ssse3, sse4_1, avx, avx2, avx512}
-# --disable-neon[-asm]
+# --disable-neon{, -asm, -dotprod, -i8mm}
+# --disable-sve
# --disable-{dspr2, msa}
#
-# Running ndk-build will build libvpx and include it in your project.
+# Running ndk-build will build libvpx and include it in your project. Set
+# APP_ABI to match the --target passed to configure:
+# https://developer.android.com/ndk/guides/application_mk#app_abi.
#
CONFIG_DIR := $(LOCAL_PATH)/
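
For reference, the --target value chosen at configure time should line up with
the ABI name Android expects in Application.mk: --target=arm64-android-gcc
pairs with APP_ABI := arm64-v8a, and --target=armv7-android-gcc with
APP_ABI := armeabi-v7a.
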
diff --git a/media/libvpx/libvpx/build/make/Makefile b/media/libvpx/libvpx/build/make/Makefile
index 199ed78058..658b37617b 100644
--- a/media/libvpx/libvpx/build/make/Makefile
+++ b/media/libvpx/libvpx/build/make/Makefile
@@ -150,6 +150,8 @@ $(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm
$(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm
$(BUILD_PFX)%_sve.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve
$(BUILD_PFX)%_sve.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve
+$(BUILD_PFX)%_sve2.c.d: CFLAGS += -march=armv9-a+sve2
+$(BUILD_PFX)%_sve2.c.o: CFLAGS += -march=armv9-a+sve2
# POWER
$(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx
diff --git a/media/libvpx/libvpx/build/make/configure.sh b/media/libvpx/libvpx/build/make/configure.sh
index 869793a296..009bf7db5c 100644
--- a/media/libvpx/libvpx/build/make/configure.sh
+++ b/media/libvpx/libvpx/build/make/configure.sh
@@ -74,6 +74,8 @@ Build options:
--cpu=CPU optimize for a specific cpu rather than a family
--extra-cflags=ECFLAGS add ECFLAGS to CFLAGS [$CFLAGS]
--extra-cxxflags=ECXXFLAGS add ECXXFLAGS to CXXFLAGS [$CXXFLAGS]
+ --use-profile=PROFILE_FILE
+ Use PROFILE_FILE for PGO
${toggle_extra_warnings} emit harmless warnings (always non-fatal)
${toggle_werror} treat warnings as errors, if possible
(not available with all compilers)
@@ -81,6 +83,7 @@ Build options:
${toggle_pic} turn on/off Position Independent Code
${toggle_ccache} turn on/off compiler cache
${toggle_debug} enable/disable debug mode
+ ${toggle_profile} enable/disable profiling
${toggle_gprof} enable/disable gprof profiling instrumentation
${toggle_gcov} enable/disable gcov coverage instrumentation
${toggle_thumb} enable/disable building arm assembly in thumb mode
@@ -429,6 +432,26 @@ check_gcc_machine_options() {
fi
}
+check_neon_sve_bridge_compiles() {
+ if enabled sve; then
+ check_cc -march=armv8.2-a+dotprod+i8mm+sve <<EOF
+#ifndef __ARM_NEON_SVE_BRIDGE
+#error 1
+#endif
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+EOF
+ compile_result=$?
+ if [ ${compile_result} -ne 0 ]; then
+ log_echo " disabling sve: arm_neon_sve_bridge.h not supported by compiler"
+ log_echo " disabling sve2: arm_neon_sve_bridge.h not supported by compiler"
+ disable_feature sve
+ disable_feature sve2
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sve --disable-sve2 "
+ fi
+ fi
+}
+
check_gcc_avx512_compiles() {
if disabled gcc; then
return
@@ -611,6 +634,9 @@ process_common_cmdline() {
--extra-cxxflags=*)
extra_cxxflags="${optval}"
;;
+ --use-profile=*)
+ pgo_file=${optval}
+ ;;
--enable-?*|--disable-?*)
eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
if is_in ${option} ${ARCH_EXT_LIST}; then
@@ -951,7 +977,7 @@ EOF
add_cflags "-mmacosx-version-min=10.15"
add_ldflags "-mmacosx-version-min=10.15"
;;
- *-darwin2[0-2]-*)
+ *-darwin2[0-3]-*)
add_cflags "-arch ${toolchain%%-*}"
add_ldflags "-arch ${toolchain%%-*}"
;;
@@ -980,36 +1006,18 @@ EOF
case ${toolchain} in
arm*)
soft_enable runtime_cpu_detect
- # Arm ISA extensions are treated as supersets.
- case ${tgt_isa} in
- arm64|armv8)
- for ext in ${ARCH_EXT_LIST_AARCH64}; do
- # Disable higher order extensions to simplify dependencies.
- if [ "$disable_exts" = "yes" ]; then
- if ! disabled $ext; then
- RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
- disable_feature $ext
- fi
- elif disabled $ext; then
- disable_exts="yes"
- else
- soft_enable $ext
- fi
- done
- ;;
- armv7|armv7s)
- soft_enable neon
- # Only enable neon_asm when neon is also enabled.
- enabled neon && soft_enable neon_asm
- # If someone tries to force it through, die.
- if disabled neon && enabled neon_asm; then
- die "Disabling neon while keeping neon-asm is not supported"
- fi
- ;;
- esac
- asm_conversion_cmd="cat"
+ if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
+ soft_enable neon
+ # Only enable neon_asm when neon is also enabled.
+ enabled neon && soft_enable neon_asm
+ # If someone tries to force it through, die.
+ if disabled neon && enabled neon_asm; then
+ die "Disabling neon while keeping neon-asm is not supported"
+ fi
+ fi
+ asm_conversion_cmd="cat"
case ${tgt_cc} in
gcc)
link_with_cc=gcc
@@ -1228,6 +1236,38 @@ EOF
fi
;;
esac
+
+ # AArch64 ISA extensions are treated as supersets.
+ if [ ${tgt_isa} = "arm64" ] || [ ${tgt_isa} = "armv8" ]; then
+ aarch64_arch_flag_neon="arch=armv8-a"
+ aarch64_arch_flag_neon_dotprod="arch=armv8.2-a+dotprod"
+ aarch64_arch_flag_neon_i8mm="arch=armv8.2-a+dotprod+i8mm"
+ aarch64_arch_flag_sve="arch=armv8.2-a+dotprod+i8mm+sve"
+ aarch64_arch_flag_sve2="arch=armv9-a+sve2"
+ for ext in ${ARCH_EXT_LIST_AARCH64}; do
+ if [ "$disable_exts" = "yes" ]; then
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+ soft_disable $ext
+ else
+ # Check the compiler supports the -march flag for the extension.
+ # This needs to happen after toolchain/OS inspection so we handle
+ # $CROSS etc correctly when checking for flags, else these will
+ # always fail.
+ flag="$(eval echo \$"aarch64_arch_flag_${ext}")"
+ check_gcc_machine_option "${flag}" "${ext}"
+ if ! enabled $ext; then
+ # Disable higher order extensions to simplify dependencies.
+ disable_exts="yes"
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+ soft_disable $ext
+ fi
+ fi
+ done
+ if enabled sve; then
+ check_neon_sve_bridge_compiles
+ fi
+ fi
+
;;
mips*)
link_with_cc=gcc
@@ -1484,6 +1524,14 @@ EOF
;;
esac
+ # Enable PGO
+ if [ -n "${pgo_file}" ]; then
+ check_add_cflags -fprofile-use=${pgo_file} || \
+ die "-fprofile-use is not supported by compiler"
+ check_add_ldflags -fprofile-use=${pgo_file} || \
+ die "-fprofile-use is not supported by linker"
+ fi
+
# Try to enable CPU specific tuning
if [ -n "${tune_cpu}" ]; then
if [ -n "${tune_cflags}" ]; then
@@ -1504,6 +1552,9 @@ EOF
else
check_add_cflags -DNDEBUG
fi
+ enabled profile &&
+ check_add_cflags -fprofile-generate &&
+ check_add_ldflags -fprofile-generate
enabled gprof && check_add_cflags -pg && check_add_ldflags -pg
enabled gcov &&
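
The arm_neon_sve_bridge.h probe added earlier in this file gates kernels that
move data between NEON and SVE registers without a round trip through memory.
A minimal sketch of the bridged idiom, with a hypothetical helper name (the
svset_neonq/svget_neonq intrinsics are the bridge; svdot_s64 is the SVE
64-bit dot product):

  #include <arm_neon.h>
  #include <arm_sve.h>
  #include <arm_neon_sve_bridge.h>

  // Hypothetical helper: accumulate a widening dot product of two NEON
  // int16x8 vectors into 64-bit lanes using the SVE SDOT instruction.
  static inline int64x2_t dot_s16_sve(int64x2_t acc, int16x8_t a,
                                      int16x8_t b) {
    svint64_t acc_sve = svset_neonq_s64(svundef_s64(), acc);
    acc_sve = svdot_s64(acc_sve, svset_neonq_s16(svundef_s16(), a),
                        svset_neonq_s16(svundef_s16(), b));
    return svget_neonq_s64(acc_sve);  // low 128 bits hold the NEON result
  }
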
diff --git a/media/libvpx/libvpx/build/make/rtcd.pl b/media/libvpx/libvpx/build/make/rtcd.pl
index 0b9e16738e..025238d678 100755
--- a/media/libvpx/libvpx/build/make/rtcd.pl
+++ b/media/libvpx/libvpx/build/make/rtcd.pl
@@ -487,7 +487,7 @@ if ($opts{arch} eq 'x86') {
@ALL_ARCHS = filter(qw/neon_asm neon/);
arm;
} elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
- @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve/);
+ @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve sve2/);
@REQUIRES = filter(qw/neon/);
&require(@REQUIRES);
arm;
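
rtcd.pl expands each architecture list into run-time dispatch: every
optimized symbol gets a function pointer that is bound once, at init, to the
best flavor the running CPU reports. A simplified, self-contained sketch of
that pattern (names and flag values are placeholders, not the literal
generated code):

  // Illustrative shape of rtcd-generated dispatch for an AArch64 build.
  #define HAS_NEON 0x1  // placeholder values; the real flags live in
  #define HAS_SVE2 0x2  // vpx_ports/arm.h and the generated rtcd headers

  static void convolve_c(const unsigned char *, unsigned char *) {}
  static void convolve_neon(const unsigned char *, unsigned char *) {}
  static void convolve_sve2(const unsigned char *, unsigned char *) {}
  static int arm_cpu_caps() { return HAS_NEON; }  // stub for the real probe

  typedef void (*convolve_fn)(const unsigned char *, unsigned char *);
  static convolve_fn convolve = convolve_c;

  static void setup_rtcd_internal() {
    const int flags = arm_cpu_caps();
    convolve = convolve_c;                           // portable baseline
    if (flags & HAS_NEON) convolve = convolve_neon;
    if (flags & HAS_SVE2) convolve = convolve_sve2;  // last superset wins
  }
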
diff --git a/media/libvpx/libvpx/configure b/media/libvpx/libvpx/configure
index b212e0709d..97e78996e8 100755
--- a/media/libvpx/libvpx/configure
+++ b/media/libvpx/libvpx/configure
@@ -260,6 +260,7 @@ ARCH_EXT_LIST_AARCH64="
neon_dotprod
neon_i8mm
sve
+ sve2
"
ARCH_EXT_LIST_X86="
@@ -376,6 +377,7 @@ CMDLINE_SELECT="
install_libs
install_srcs
debug
+ profile
gprof
gcov
pic
@@ -659,6 +661,7 @@ process_toolchain() {
check_add_cflags -Wmissing-declarations
check_add_cflags -Wmissing-prototypes
check_add_cflags -Wshadow
+ check_add_cflags -Wstrict-prototypes
check_add_cflags -Wuninitialized
check_add_cflags -Wunreachable-code-aggressive
check_add_cflags -Wunused
@@ -677,6 +680,10 @@ process_toolchain() {
# would be needed to apply this only to test/*.cc.
check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32
+ # Do not allow implicit vector type conversions on Clang builds (this
+ # is already the default on GCC builds).
+ check_add_cflags -flax-vector-conversions=none
+
# Quiet gcc 6 vs 7 abi warnings:
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
if enabled arm; then
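
For illustration, the kind of code that -flax-vector-conversions=none rejects:
two NEON vector types with identical layout no longer convert implicitly and
need an explicit reinterpret (a standalone sketch, not from the patch):

  #include <arm_neon.h>

  int16x8_t as_signed(uint16x8_t v) {
    // return v;  // rejected under -flax-vector-conversions=none
    return vreinterpretq_s16_u16(v);  // explicit reinterpret, always valid
  }
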
diff --git a/media/libvpx/libvpx/examples/resize_util.c b/media/libvpx/libvpx/examples/resize_util.c
index 5fb63e1660..083bd2519d 100644
--- a/media/libvpx/libvpx/examples/resize_util.c
+++ b/media/libvpx/libvpx/examples/resize_util.c
@@ -20,7 +20,7 @@
static const char *exec_name = NULL;
-static void usage() {
+static void usage(void) {
printf("Usage:\n");
printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ",
exec_name);
diff --git a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c
index 998e4fb20d..4050c093cd 100644
--- a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -1156,12 +1156,13 @@ int main(int argc, const char **argv) {
#if CONFIG_VP9_DECODER && !SIMULCAST_MODE
vpx_codec_control(&encoder, VP9E_GET_SVC_LAYER_ID, &layer_id);
// Don't look for mismatch on top spatial and top temporal layers as they
- // are non reference frames.
+      // are non-reference frames. Don't look at frames whose top spatial layer
+ // is dropped.
if ((enc_cfg.ss_number_layers > 1 || enc_cfg.ts_number_layers > 1) &&
+ cx_pkt->data.frame
+ .spatial_layer_encoded[enc_cfg.ss_number_layers - 1] &&
!(layer_id.temporal_layer_id > 0 &&
- layer_id.temporal_layer_id == (int)enc_cfg.ts_number_layers - 1 &&
- cx_pkt->data.frame
- .spatial_layer_encoded[enc_cfg.ss_number_layers - 1])) {
+ layer_id.temporal_layer_id == (int)enc_cfg.ts_number_layers - 1)) {
test_decode(&encoder, &decoder, frame_cnt, &mismatch_seen);
}
#endif
diff --git a/media/libvpx/libvpx/examples/vp9cx_set_ref.c b/media/libvpx/libvpx/examples/vp9cx_set_ref.c
index 1a0823153b..6e12d668b0 100644
--- a/media/libvpx/libvpx/examples/vp9cx_set_ref.c
+++ b/media/libvpx/libvpx/examples/vp9cx_set_ref.c
@@ -60,7 +60,7 @@
static const char *exec_name;
-void usage_exit() {
+void usage_exit(void) {
fprintf(stderr,
"Usage: %s <width> <height> <infile> <outfile> "
"<frame> <limit(optional)>\n",
diff --git a/media/libvpx/libvpx/libs.doxy_template b/media/libvpx/libvpx/libs.doxy_template
index 1ee442af3e..6d05162d00 100644
--- a/media/libvpx/libvpx/libs.doxy_template
+++ b/media/libvpx/libvpx/libs.doxy_template
@@ -1223,14 +1223,6 @@ DOT_GRAPH_MAX_NODES = 50
MAX_DOT_GRAPH_DEPTH = 0
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, which results in a white background.
-# Warning: Depending on the platform used, enabling this option may lead to
-# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
-# read).
-
-DOT_TRANSPARENT = YES
-
# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
# files in one run (i.e. multiple -o and -T options on the command line). This
# makes dot run faster, but since only newer versions of dot (>1.8.10)
diff --git a/media/libvpx/libvpx/libs.mk b/media/libvpx/libvpx/libs.mk
index ff1c569c3b..5964386710 100644
--- a/media/libvpx/libvpx/libs.mk
+++ b/media/libvpx/libvpx/libs.mk
@@ -313,9 +313,9 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
# To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current
# SO_VERSION_* then follow the rules in the link to determine the new version
# (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
-SO_VERSION_MAJOR := 8
+SO_VERSION_MAJOR := 9
SO_VERSION_MINOR := 0
-SO_VERSION_PATCH := 1
+SO_VERSION_PATCH := 0
ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib
SHARED_LIB_SUF := .dylib
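
Worked through the libtool current:revision:age rules the comment refers to:
the previous MAJOR=8, MINOR=0, PATCH=1 correspond to c=8, a=0, r=1. Since
v1.14.0 breaks ABI, c1=c+1=9 and a1=r1=0, so MAJOR=[c1-a1]=9, MINOR=a1=0 and
PATCH=r1=0, which matches the values set above.
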
diff --git a/media/libvpx/libvpx/test/android/get_files.py b/media/libvpx/libvpx/test/android/get_files.py
index 1c69740d2b..98ce7b1947 100644
--- a/media/libvpx/libvpx/test/android/get_files.py
+++ b/media/libvpx/libvpx/test/android/get_files.py
@@ -38,7 +38,7 @@ def get_file_sha(filename):
buf = file.read(HASH_CHUNK)
return sha_hash.hexdigest()
except IOError:
- print "Error reading " + filename
+ print("Error reading " + filename)
# Downloads a file from a url, and then checks the sha against the passed
# in sha
@@ -67,7 +67,7 @@ try:
getopt.getopt(sys.argv[1:], \
"u:i:o:", ["url=", "input_csv=", "output_dir="])
except:
- print 'get_files.py -u <url> -i <input_csv> -o <output_dir>'
+ print('get_files.py -u <url> -i <input_csv> -o <output_dir>')
sys.exit(2)
for opt, arg in opts:
@@ -79,7 +79,7 @@ for opt, arg in opts:
local_resource_path = os.path.join(arg)
if len(sys.argv) != 7:
- print "Expects two paths and a url!"
+ print("Expects two paths and a url!")
exit(1)
if not os.path.isdir(local_resource_path):
@@ -89,7 +89,7 @@ file_list_csv = open(file_list_path, "rb")
# Our 'csv' file uses multiple spaces as a delimiter, python's
# csv class only uses single character delimiters, so we convert them below
-file_list_reader = csv.reader((re.sub(' +', ' ', line) \
+file_list_reader = csv.reader((re.sub(' +', ' ', line.decode('utf-8')) \
for line in file_list_csv), delimiter = ' ')
file_shas = []
@@ -104,15 +104,16 @@ for row in file_list_reader:
file_list_csv.close()
# Download files, only if they don't already exist and have correct shas
-for filename, sha in itertools.izip(file_names, file_shas):
+for filename, sha in zip(file_names, file_shas):
+ filename = filename.lstrip('*')
path = os.path.join(local_resource_path, filename)
if os.path.isfile(path) \
and get_file_sha(path) == sha:
- print path + ' exists, skipping'
+ print(path + ' exists, skipping')
continue
for retry in range(0, ftp_retries):
- print "Downloading " + path
+ print("Downloading " + path)
if not download_and_check_sha(url, filename, sha):
- print "Sha does not match, retrying..."
+ print("Sha does not match, retrying...")
else:
break
diff --git a/media/libvpx/libvpx/test/avg_test.cc b/media/libvpx/libvpx/test/avg_test.cc
index ede9c0ba8c..7816912ff7 100644
--- a/media/libvpx/libvpx/test/avg_test.cc
+++ b/media/libvpx/libvpx/test/avg_test.cc
@@ -719,6 +719,15 @@ INSTANTIATE_TEST_SUITE_P(
make_tuple(1024, &vp9_block_error_fp_neon)));
#endif // HAVE_NEON
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, BlockErrorTestFP,
+ ::testing::Values(make_tuple(16, &vp9_block_error_fp_sve),
+ make_tuple(64, &vp9_block_error_fp_sve),
+ make_tuple(256, &vp9_block_error_fp_sve),
+ make_tuple(1024, &vp9_block_error_fp_sve)));
+#endif // HAVE_SVE
+
#if HAVE_MSA
INSTANTIATE_TEST_SUITE_P(
MSA, AverageTest,
diff --git a/media/libvpx/libvpx/test/codec_factory.h b/media/libvpx/libvpx/test/codec_factory.h
index c7e8f54847..179ccdf011 100644
--- a/media/libvpx/libvpx/test/codec_factory.h
+++ b/media/libvpx/libvpx/test/codec_factory.h
@@ -164,7 +164,9 @@ const libvpx_test::VP8CodecFactory kVP8;
&libvpx_test::kVP8)), \
__VA_ARGS__))
#else
-#define VP8_INSTANTIATE_TEST_SUITE(test, ...)
+// static_assert() is used to avoid warnings about an extra ';' outside of a
+// function.
+#define VP8_INSTANTIATE_TEST_SUITE(test, ...) static_assert(CONFIG_VP8 == 0, "")
#endif // CONFIG_VP8
/*
@@ -259,7 +261,9 @@ const libvpx_test::VP9CodecFactory kVP9;
&libvpx_test::kVP9)), \
__VA_ARGS__))
#else
-#define VP9_INSTANTIATE_TEST_SUITE(test, ...)
+// static_assert() is used to avoid warnings about an extra ';' outside of a
+// function.
+#define VP9_INSTANTIATE_TEST_SUITE(test, ...) static_assert(CONFIG_VP9 == 0, "")
#endif // CONFIG_VP9
} // namespace libvpx_test
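
The static_assert replacement above works because a function-like macro that
expands to nothing leaves a bare ';' at namespace scope, which warnings such
as Clang's -Wextra-semi flag; expanding to a static_assert instead gives that
semicolon a declaration to terminate, and re-checks the config for free. A
minimal illustration with hypothetical macro names:

  // Expands to nothing, so the trailing ';' is a stray empty declaration.
  #define EMPTY_SUITE(test, ...)
  EMPTY_SUITE(Foo, 1);

  // Expands to a complete declaration that the trailing ';' terminates.
  #define SAFE_SUITE(test, ...) static_assert(true, "")
  SAFE_SUITE(Bar, 1);
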
diff --git a/media/libvpx/libvpx/test/convolve_test.cc b/media/libvpx/libvpx/test/convolve_test.cc
index ffd5c41c63..11f7625137 100644
--- a/media/libvpx/libvpx/test/convolve_test.cc
+++ b/media/libvpx/libvpx/test/convolve_test.cc
@@ -1218,6 +1218,24 @@ WRAP(convolve8_neon, 12)
WRAP(convolve8_avg_neon, 12)
#endif // HAVE_NEON
+#if HAVE_SVE
+WRAP(convolve8_horiz_sve, 8)
+WRAP(convolve8_avg_horiz_sve, 8)
+WRAP(convolve8_horiz_sve, 10)
+WRAP(convolve8_avg_horiz_sve, 10)
+WRAP(convolve8_horiz_sve, 12)
+WRAP(convolve8_avg_horiz_sve, 12)
+#endif // HAVE_SVE
+
+#if HAVE_SVE2
+WRAP(convolve8_vert_sve2, 8)
+WRAP(convolve8_avg_vert_sve2, 8)
+WRAP(convolve8_vert_sve2, 10)
+WRAP(convolve8_avg_vert_sve2, 10)
+WRAP(convolve8_vert_sve2, 12)
+WRAP(convolve8_avg_vert_sve2, 12)
+#endif // HAVE_SVE2
+
WRAP(convolve_copy_c, 8)
WRAP(convolve_avg_c, 8)
WRAP(convolve8_horiz_c, 8)
@@ -1438,6 +1456,74 @@ INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, ConvolveTest,
::testing::ValuesIn(kArrayConvolve_neon_dotprod));
#endif // HAVE_NEON_DOTPROD
+#if HAVE_SVE
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions convolve8_sve(
+ wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_sve_8,
+ wrap_convolve8_avg_horiz_sve_8, wrap_convolve8_vert_c_8,
+ wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8,
+ wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
+ wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8,
+ wrap_convolve8_avg_c_8, 8);
+const ConvolveFunctions convolve10_sve(
+ wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
+ wrap_convolve8_horiz_sve_10, wrap_convolve8_avg_horiz_sve_10,
+ wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
+ wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10,
+ wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
+ wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
+ 10);
+const ConvolveFunctions convolve12_sve(
+ wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
+ wrap_convolve8_horiz_sve_12, wrap_convolve8_avg_horiz_sve_12,
+ wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
+ wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12,
+ wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
+ wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
+ 12);
+
+const ConvolveParam kArrayConvolve_sve[] = { ALL_SIZES(convolve8_sve),
+ ALL_SIZES(convolve10_sve),
+ ALL_SIZES(convolve12_sve) };
+INSTANTIATE_TEST_SUITE_P(SVE, ConvolveTest,
+ ::testing::ValuesIn(kArrayConvolve_sve));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_SVE
+
+#if HAVE_SVE2
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions convolve8_sve2(
+ wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_c_8,
+ wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_sve2_8,
+ wrap_convolve8_avg_vert_sve2_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8,
+ wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
+ wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8,
+ wrap_convolve8_avg_c_8, 8);
+const ConvolveFunctions convolve10_sve2(
+ wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, wrap_convolve8_horiz_c_10,
+ wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_sve2_10,
+ wrap_convolve8_avg_vert_sve2_10, wrap_convolve8_c_10,
+ wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10,
+ wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
+ wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
+ 10);
+const ConvolveFunctions convolve12_sve2(
+ wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, wrap_convolve8_horiz_c_12,
+ wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_sve2_12,
+ wrap_convolve8_avg_vert_sve2_12, wrap_convolve8_c_12,
+ wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12,
+ wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
+ wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
+ 12);
+
+const ConvolveParam kArrayConvolve_sve2[] = { ALL_SIZES(convolve8_sve2),
+ ALL_SIZES(convolve10_sve2),
+ ALL_SIZES(convolve12_sve2) };
+INSTANTIATE_TEST_SUITE_P(SVE2, ConvolveTest,
+ ::testing::ValuesIn(kArrayConvolve_sve2));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_SVE2
+
#if HAVE_NEON_I8MM
const ConvolveFunctions convolve8_neon_i8mm(
vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_i8mm,
diff --git a/media/libvpx/libvpx/test/encode_api_test.cc b/media/libvpx/libvpx/test/encode_api_test.cc
index 508083673a..ca3b17a5d5 100644
--- a/media/libvpx/libvpx/test/encode_api_test.cc
+++ b/media/libvpx/libvpx/test/encode_api_test.cc
@@ -8,7 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <cassert>
#include <climits>
+#include <cstdint>
#include <cstring>
#include <initializer_list>
#include <new>
@@ -44,6 +46,49 @@ bool IsVP9(vpx_codec_iface_t *iface) {
0;
}
+void *Memset16(void *dest, int val, size_t length) {
+ uint16_t *dest16 = reinterpret_cast<uint16_t *>(dest);
+ for (size_t i = 0; i < length; i++) {
+ *dest16++ = val;
+ }
+ return dest;
+}
+
+vpx_image_t *CreateImage(vpx_bit_depth_t bit_depth, vpx_img_fmt_t fmt,
+ unsigned int width, unsigned int height) {
+ assert(fmt != VPX_IMG_FMT_NV12);
+ if (bit_depth > VPX_BITS_8) {
+ fmt = static_cast<vpx_img_fmt_t>(fmt | VPX_IMG_FMT_HIGHBITDEPTH);
+ }
+ vpx_image_t *image = vpx_img_alloc(nullptr, fmt, width, height, 1);
+ if (!image) return image;
+
+ const int val = 1 << (bit_depth - 1);
+ const unsigned int uv_h =
+ (image->d_h + image->y_chroma_shift) >> image->y_chroma_shift;
+ const unsigned int uv_w =
+ (image->d_w + image->x_chroma_shift) >> image->x_chroma_shift;
+ if (bit_depth > VPX_BITS_8) {
+ for (unsigned int i = 0; i < image->d_h; ++i) {
+ Memset16(image->planes[0] + i * image->stride[0], val, image->d_w);
+ }
+ for (unsigned int i = 0; i < uv_h; ++i) {
+ Memset16(image->planes[1] + i * image->stride[1], val, uv_w);
+ Memset16(image->planes[2] + i * image->stride[2], val, uv_w);
+ }
+ } else {
+ for (unsigned int i = 0; i < image->d_h; ++i) {
+ memset(image->planes[0] + i * image->stride[0], val, image->d_w);
+ }
+ for (unsigned int i = 0; i < uv_h; ++i) {
+ memset(image->planes[1] + i * image->stride[1], val, uv_w);
+ memset(image->planes[2] + i * image->stride[2], val, uv_w);
+ }
+ }
+
+ return image;
+}
+
TEST(EncodeAPI, InvalidParams) {
uint8_t buf[1] = { 0 };
vpx_image_t img;
@@ -198,7 +243,51 @@ TEST(EncodeAPI, RandomPixelsVp8) {
ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
// Generate random frame data and encode
- uint8_t img[1280 * 720 * 3 / 2];
+ libvpx_test::RandomVideoSource video;
+ video.SetSize(cfg.g_w, cfg.g_h);
+ video.SetImageFormat(VPX_IMG_FMT_I420);
+ video.Begin();
+ ASSERT_EQ(vpx_codec_encode(&enc, video.img(), video.pts(), video.duration(),
+ /*flags=*/0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_OK);
+
+ // Destroy libvpx encoder
+ vpx_codec_destroy(&enc);
+}
+
+TEST(EncodeAPI, ChangeToL1T3AndSetBitrateVp8) {
+ // Initialize libvpx encoder
+ vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+ vpx_codec_enc_cfg_t cfg;
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+ cfg.g_threads = 1;
+ cfg.g_profile = 0;
+ cfg.g_w = 1;
+ cfg.g_h = 64;
+ cfg.g_bit_depth = VPX_BITS_8;
+ cfg.g_input_bit_depth = 8;
+ cfg.g_timebase.num = 1;
+ cfg.g_timebase.den = 1000000;
+ cfg.g_pass = VPX_RC_ONE_PASS;
+ cfg.g_lag_in_frames = 0;
+ cfg.rc_dropframe_thresh = 0; // Don't drop frames
+ cfg.rc_resize_allowed = 0;
+ cfg.rc_end_usage = VPX_VBR;
+ cfg.rc_target_bitrate = 10;
+ cfg.rc_min_quantizer = 2;
+ cfg.rc_max_quantizer = 58;
+ cfg.kf_mode = VPX_KF_AUTO;
+ cfg.kf_min_dist = 0;
+ cfg.kf_max_dist = 10000;
+
+ vpx_codec_ctx_t enc;
+ ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+ ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, -6), VPX_CODEC_OK);
+
+ // Generate random frame data and encode
+ uint8_t img[1 * 64 * 3 / 2];
libvpx_test::ACMRandom rng;
for (size_t i = 0; i < sizeof(img); ++i) {
img[i] = rng.Rand8();
@@ -207,13 +296,142 @@ TEST(EncodeAPI, RandomPixelsVp8) {
ASSERT_EQ(
vpx_img_wrap(&img_wrapper, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1, img),
&img_wrapper);
- ASSERT_EQ(vpx_codec_encode(&enc, &img_wrapper, 0, 1, 0, VPX_DL_BEST_QUALITY),
+ vpx_enc_frame_flags_t flags = VPX_EFLAG_FORCE_KF;
+ ASSERT_EQ(
+ vpx_codec_encode(&enc, &img_wrapper, 0, 500000, flags, VPX_DL_REALTIME),
+ VPX_CODEC_OK);
+ ASSERT_EQ(vpx_codec_encode(&enc, nullptr, -1, 0, 0, 0), VPX_CODEC_OK);
+
+ cfg.rc_target_bitrate = 4294967;
+ // Set the scalability mode to L1T3.
+ cfg.ts_number_layers = 3;
+ cfg.ts_periodicity = 4;
+ cfg.ts_layer_id[0] = 0;
+ cfg.ts_layer_id[1] = 2;
+ cfg.ts_layer_id[2] = 1;
+ cfg.ts_layer_id[3] = 2;
+ cfg.ts_rate_decimator[0] = 4;
+ cfg.ts_rate_decimator[1] = 2;
+ cfg.ts_rate_decimator[2] = 1;
+ // Bitrate allocation L0: 50% L1: 20% L2: 30%
+ cfg.layer_target_bitrate[0] = cfg.ts_target_bitrate[0] =
+ 50 * cfg.rc_target_bitrate / 100;
+ cfg.layer_target_bitrate[1] = cfg.ts_target_bitrate[1] =
+ 70 * cfg.rc_target_bitrate / 100;
+ cfg.layer_target_bitrate[2] = cfg.ts_target_bitrate[2] =
+ cfg.rc_target_bitrate;
+ cfg.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212;
+ cfg.g_error_resilient = VPX_ERROR_RESILIENT_DEFAULT;
+ ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK);
+
+ ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_TEMPORAL_LAYER_ID, 2),
VPX_CODEC_OK);
+ constexpr vpx_enc_frame_flags_t VP8_UPDATE_NOTHING =
+ VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
+ // Layer 2: only reference last frame, no updates
+ // It only depends on layer 0
+ flags = VP8_UPDATE_NOTHING | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_REF_GF;
+ ASSERT_EQ(
+ vpx_codec_encode(&enc, &img_wrapper, 0, 500000, flags, VPX_DL_REALTIME),
+ VPX_CODEC_OK);
+
// Destroy libvpx encoder
vpx_codec_destroy(&enc);
}
-#endif
+
+// Emulates the WebCodecs VideoEncoder interface.
+class VP8Encoder {
+ public:
+ explicit VP8Encoder(int speed) : speed_(speed) {}
+ ~VP8Encoder();
+
+ void Configure(unsigned int threads, unsigned int width, unsigned int height,
+ vpx_rc_mode end_usage, vpx_enc_deadline_t deadline);
+ void Encode(bool key_frame);
+
+ private:
+ const int speed_;
+ bool initialized_ = false;
+ vpx_codec_enc_cfg_t cfg_;
+ vpx_codec_ctx_t enc_;
+ int frame_index_ = 0;
+ vpx_enc_deadline_t deadline_ = 0;
+};
+
+VP8Encoder::~VP8Encoder() {
+ if (initialized_) {
+ EXPECT_EQ(vpx_codec_destroy(&enc_), VPX_CODEC_OK);
+ }
+}
+
+void VP8Encoder::Configure(unsigned int threads, unsigned int width,
+ unsigned int height, vpx_rc_mode end_usage,
+ vpx_enc_deadline_t deadline) {
+ deadline_ = deadline;
+
+ if (!initialized_) {
+ vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0),
+ VPX_CODEC_OK);
+ cfg_.g_threads = threads;
+ cfg_.g_w = width;
+ cfg_.g_h = height;
+ cfg_.g_timebase.num = 1;
+ cfg_.g_timebase.den = 1000 * 1000; // microseconds
+ cfg_.g_pass = VPX_RC_ONE_PASS;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_end_usage = end_usage;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 58;
+ ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK);
+ ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK);
+ initialized_ = true;
+ return;
+ }
+
+ cfg_.g_threads = threads;
+ cfg_.g_w = width;
+ cfg_.g_h = height;
+ cfg_.rc_end_usage = end_usage;
+ ASSERT_EQ(vpx_codec_enc_config_set(&enc_, &cfg_), VPX_CODEC_OK)
+ << vpx_codec_error_detail(&enc_);
+}
+
+void VP8Encoder::Encode(bool key_frame) {
+ const vpx_codec_cx_pkt_t *pkt;
+ vpx_image_t *image =
+ CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg_.g_w, cfg_.g_h);
+ ASSERT_NE(image, nullptr);
+ const vpx_enc_frame_flags_t flags = key_frame ? VPX_EFLAG_FORCE_KF : 0;
+ ASSERT_EQ(vpx_codec_encode(&enc_, image, frame_index_, 1, flags, deadline_),
+ VPX_CODEC_OK);
+ ++frame_index_;
+ vpx_codec_iter_t iter = nullptr;
+ while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT);
+ if (key_frame) {
+ ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, VPX_FRAME_IS_KEY);
+ }
+ }
+ vpx_img_free(image);
+}
+
+// This is the reproducer testcase for crbug.com/324459561. However,
+// just running this test is not enough to reproduce the bug. We also
+// need to send signals to the test.
+TEST(EncodeAPI, Chromium324459561) {
+ VP8Encoder encoder(-12);
+
+ encoder.Configure(11, 1685, 652, VPX_CBR, VPX_DL_REALTIME);
+
+ encoder.Encode(true);
+ encoder.Encode(true);
+ encoder.Encode(true);
+
+ encoder.Configure(0, 1685, 1, VPX_VBR, VPX_DL_REALTIME);
+}
+#endif // CONFIG_VP8_ENCODER
// Set up 2 spatial streams with 2 temporal layers per stream, and generate
// invalid configuration by setting the temporal layer rate allocation
@@ -499,6 +717,131 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) {
}
}
+TEST(EncodeAPI, ConfigResizeBiggerAfterInit) {
+ for (const auto *iface : kCodecIfaces) {
+ SCOPED_TRACE(vpx_codec_iface_name(iface));
+ vpx_codec_enc_cfg_t cfg;
+ vpx_codec_ctx_t enc;
+
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+ EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, 1, 1, &enc, &cfg));
+
+ cfg.g_w = 1920;
+ cfg.g_h = 1;
+ EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg),
+ IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM);
+
+ EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+ }
+}
+
+TEST(EncodeAPI, ConfigResizeBiggerAfterEncode) {
+ for (const auto *iface : kCodecIfaces) {
+ SCOPED_TRACE(vpx_codec_iface_name(iface));
+ vpx_codec_enc_cfg_t cfg;
+ vpx_codec_ctx_t enc;
+
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+ EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, 1, 1, &enc, &cfg));
+ EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc));
+
+ cfg.g_w = 1920;
+ cfg.g_h = 1;
+ EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg),
+ IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM);
+
+ cfg.g_w = 1920;
+ cfg.g_h = 1080;
+ EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg),
+ IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM);
+
+ EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+ }
+}
+
+TEST(EncodeAPI, PtsSmallerThanInitialPts) {
+ for (const auto *iface : kCodecIfaces) {
+ // Initialize libvpx encoder.
+ vpx_codec_ctx_t enc;
+ vpx_codec_enc_cfg_t cfg;
+
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+ ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+ // Create input image.
+ vpx_image_t *const image =
+ CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+ ASSERT_NE(image, nullptr);
+
+ // Encode frame.
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 12, 1, 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_OK);
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 13, 1, 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_OK);
+ // pts (10) is smaller than the initial pts (12).
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 10, 1, 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+
+ // Free resources.
+ vpx_img_free(image);
+ ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+ }
+}
+
+TEST(EncodeAPI, PtsOrDurationTooBig) {
+ for (const auto *iface : kCodecIfaces) {
+ // Initialize libvpx encoder.
+ vpx_codec_ctx_t enc;
+ vpx_codec_enc_cfg_t cfg;
+
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+ ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+ // Create input image.
+ vpx_image_t *const image =
+ CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+ ASSERT_NE(image, nullptr);
+
+ // Encode frame.
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_OK);
+#if ULONG_MAX > INT64_MAX
+ // duration is too big.
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 0, (1ul << 63), 0, 2),
+ VPX_CODEC_INVALID_PARAM);
+#endif
+ // pts, when converted to ticks, is too big.
+ ASSERT_EQ(vpx_codec_encode(&enc, image, INT64_MAX / 1000000 + 1, 1, 0,
+ VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+#if ULONG_MAX > INT64_MAX
+ // duration is too big.
+ ASSERT_EQ(
+ vpx_codec_encode(&enc, image, 0, (1ul << 63), 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+ // pts + duration is too big.
+ ASSERT_EQ(
+ vpx_codec_encode(&enc, image, 1, INT64_MAX, 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+#endif
+ // pts + duration, when converted to ticks, is too big.
+#if ULONG_MAX > INT64_MAX
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 0xbd6b566b15c7, 0,
+ VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+#endif
+ ASSERT_EQ(vpx_codec_encode(&enc, image, INT64_MAX / 1000000, 1, 0,
+ VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+
+ // Free resources.
+ vpx_img_free(image);
+ ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+ }
+}
+
#if CONFIG_VP9_ENCODER
// Frame size needed to trigger the overflow exceeds the max buffer allowed on
// 32-bit systems defined by VPX_MAX_ALLOCABLE_MEMORY
@@ -528,28 +871,16 @@ TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) {
}
#endif // VPX_ARCH_X86_64 || VPX_ARCH_AARCH64
-vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) {
- vpx_image_t *image =
- vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, width, height, 1);
- if (!image) return image;
-
- for (unsigned int i = 0; i < image->d_h; ++i) {
- memset(image->planes[0] + i * image->stride[0], 128, image->d_w);
- }
- const unsigned int uv_h = (image->d_h + 1) / 2;
- const unsigned int uv_w = (image->d_w + 1) / 2;
- for (unsigned int i = 0; i < uv_h; ++i) {
- memset(image->planes[1] + i * image->stride[1], 128, uv_w);
- memset(image->planes[2] + i * image->stride[2], 128, uv_w);
- }
-
- return image;
-}
-
// Emulates the WebCodecs VideoEncoder interface.
class VP9Encoder {
public:
- explicit VP9Encoder(int speed) : speed_(speed) {}
+ explicit VP9Encoder(int speed)
+ : speed_(speed), bit_depth_(VPX_BITS_8), fmt_(VPX_IMG_FMT_I420) {}
+ // The image format `fmt` must not have the VPX_IMG_FMT_HIGHBITDEPTH bit set.
+ // If bit_depth > 8, we will set the VPX_IMG_FMT_HIGHBITDEPTH bit before
+ // passing the image format to vpx_img_alloc().
+ VP9Encoder(int speed, vpx_bit_depth_t bit_depth, vpx_img_fmt_t fmt)
+ : speed_(speed), bit_depth_(bit_depth), fmt_(fmt) {}
~VP9Encoder();
void Configure(unsigned int threads, unsigned int width, unsigned int height,
@@ -558,6 +889,8 @@ class VP9Encoder {
private:
const int speed_;
+ const vpx_bit_depth_t bit_depth_;
+ const vpx_img_fmt_t fmt_;
bool initialized_ = false;
vpx_codec_enc_cfg_t cfg_;
vpx_codec_ctx_t enc_;
@@ -577,12 +910,22 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width,
deadline_ = deadline;
if (!initialized_) {
+ ASSERT_EQ(fmt_ & VPX_IMG_FMT_HIGHBITDEPTH, 0);
+ const bool high_bit_depth = bit_depth_ > VPX_BITS_8;
+ const bool is_420 = fmt_ == VPX_IMG_FMT_I420;
vpx_codec_iface_t *const iface = vpx_codec_vp9_cx();
ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0),
VPX_CODEC_OK);
cfg_.g_threads = threads;
+ // In profiles 0 and 2, only 4:2:0 format is allowed. In profiles 1 and 3,
+ // all other subsampling formats are allowed. In profiles 0 and 1, only bit
+ // depth 8 is allowed. In profiles 2 and 3, only bit depths 10 and 12 are
+ // allowed.
+ cfg_.g_profile = 2 * high_bit_depth + !is_420;
cfg_.g_w = width;
cfg_.g_h = height;
+ cfg_.g_bit_depth = bit_depth_;
+ cfg_.g_input_bit_depth = bit_depth_;
cfg_.g_timebase.num = 1;
cfg_.g_timebase.den = 1000 * 1000; // microseconds
cfg_.g_pass = VPX_RC_ONE_PASS;
@@ -590,7 +933,10 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width,
cfg_.rc_end_usage = end_usage;
cfg_.rc_min_quantizer = 2;
cfg_.rc_max_quantizer = 58;
- ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK);
+ ASSERT_EQ(
+ vpx_codec_enc_init(&enc_, iface, &cfg_,
+ high_bit_depth ? VPX_CODEC_USE_HIGHBITDEPTH : 0),
+ VPX_CODEC_OK);
ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK);
initialized_ = true;
return;
@@ -606,13 +952,13 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width,
void VP9Encoder::Encode(bool key_frame) {
const vpx_codec_cx_pkt_t *pkt;
- vpx_image_t *image = CreateImage(cfg_.g_w, cfg_.g_h);
+ vpx_image_t *image = CreateImage(bit_depth_, fmt_, cfg_.g_w, cfg_.g_h);
ASSERT_NE(image, nullptr);
const vpx_enc_frame_flags_t frame_flags = key_frame ? VPX_EFLAG_FORCE_KF : 0;
ASSERT_EQ(
vpx_codec_encode(&enc_, image, frame_index_, 1, frame_flags, deadline_),
VPX_CODEC_OK);
- frame_index_++;
+ ++frame_index_;
vpx_codec_iter_t iter = nullptr;
while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) {
ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT);
@@ -944,6 +1290,28 @@ TEST(EncodeAPI, Buganizer311294795) {
encoder.Encode(false);
encoder.Encode(false);
}
+
+TEST(EncodeAPI, Buganizer317105128) {
+ VP9Encoder encoder(-9);
+ encoder.Configure(0, 1, 1, VPX_CBR, VPX_DL_GOOD_QUALITY);
+ encoder.Configure(16, 1920, 1, VPX_CBR, VPX_DL_REALTIME);
+}
+
+TEST(EncodeAPI, Buganizer319964497) {
+ VP9Encoder encoder(7);
+ encoder.Configure(/*threads=*/1, /*width=*/320, /*height=*/240, VPX_VBR,
+ VPX_DL_REALTIME);
+ encoder.Encode(/*key_frame=*/true);
+ encoder.Encode(/*key_frame=*/true);
+ encoder.Encode(/*key_frame=*/false);
+ encoder.Configure(/*threads=*/1, /*width=*/1, /*height=*/1, VPX_VBR,
+ VPX_DL_REALTIME);
+ encoder.Encode(/*key_frame=*/false);
+ encoder.Configure(/*threads=*/1, /*width=*/2, /*height=*/2, VPX_CBR,
+ VPX_DL_REALTIME);
+ encoder.Encode(/*key_frame=*/false);
+}
+
#endif // CONFIG_VP9_ENCODER
} // namespace
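
Several tests above end a stream by handing vpx_codec_encode() a null image.
When an encoder is configured with g_lag_in_frames > 0, the same call also
drains delayed packets; a minimal flush loop in that style might look like
this (a sketch under those assumptions, not part of the patch):

  #include "vpx/vpx_encoder.h"

  // Drain lagged packets and signal end of stream (illustrative sketch).
  static void FlushEncoder(vpx_codec_ctx_t *enc) {
    bool got_data = true;
    while (got_data) {
      if (vpx_codec_encode(enc, nullptr, /*pts=*/-1, /*duration=*/1,
                           /*flags=*/0, VPX_DL_GOOD_QUALITY) != VPX_CODEC_OK) {
        break;
      }
      got_data = false;
      vpx_codec_iter_t iter = nullptr;
      const vpx_codec_cx_pkt_t *pkt;
      while ((pkt = vpx_codec_get_cx_data(enc, &iter)) != nullptr) {
        got_data = true;  // a real caller would consume pkt->data.frame here
      }
    }
  }
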
diff --git a/media/libvpx/libvpx/test/frame_size_tests.cc b/media/libvpx/libvpx/test/frame_size_tests.cc
index eea5647a78..6306e4f2ca 100644
--- a/media/libvpx/libvpx/test/frame_size_tests.cc
+++ b/media/libvpx/libvpx/test/frame_size_tests.cc
@@ -193,7 +193,7 @@ TEST_F(VP9FrameSizeTestsLarge, ValidSizes) {
// size or almost 1 gig of memory.
// In total the allocations will exceed 2GiB which may cause a failure with
// mingw + wine, use a smaller size in that case.
-#if defined(_WIN32) && !defined(_WIN64) || defined(__OS2__)
+#if defined(_WIN32) && !defined(_WIN64)
video.SetSize(4096, 3072);
#else
video.SetSize(4096, 4096);
diff --git a/media/libvpx/libvpx/test/init_vpx_test.cc b/media/libvpx/libvpx/test/init_vpx_test.cc
index f66f00b5c1..353c5043eb 100644
--- a/media/libvpx/libvpx/test/init_vpx_test.cc
+++ b/media/libvpx/libvpx/test/init_vpx_test.cc
@@ -57,6 +57,9 @@ void init_vpx_test() {
if (!(caps & HAS_SVE)) {
append_negative_gtest_filter(":SVE.*:SVE/*");
}
+ if (!(caps & HAS_SVE2)) {
+ append_negative_gtest_filter(":SVE2.*:SVE2/*");
+ }
#elif VPX_ARCH_ARM
const int caps = arm_cpu_caps();
if (!(caps & HAS_NEON)) append_negative_gtest_filter(":NEON.*:NEON/*");
diff --git a/media/libvpx/libvpx/test/resize_test.cc b/media/libvpx/libvpx/test/resize_test.cc
index 20ad2229b4..f27bd7ebbc 100644
--- a/media/libvpx/libvpx/test/resize_test.cc
+++ b/media/libvpx/libvpx/test/resize_test.cc
@@ -7,8 +7,6 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <stdio.h>
-
#include <climits>
#include <vector>
#include "third_party/googletest/src/include/gtest/gtest.h"
@@ -598,6 +596,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
mismatch_nframes_ = 0;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
unsigned int last_w = cfg_.g_w;
unsigned int last_h = cfg_.g_h;
int resize_count = 0;
@@ -613,12 +612,12 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
}
}
-#if CONFIG_VP9_DECODER
// Verify that we get 1 resize down event in this test.
ASSERT_EQ(1, resize_count) << "Resizing should occur.";
EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
#else
- printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
+ GTEST_SKIP()
+ << "Warning: VP9 decoder unavailable, unable to check resize count!\n";
#endif
}
@@ -669,7 +668,8 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
ASSERT_EQ(resize_count, 4) << "Resizing should occur twice.";
EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
#else
- printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
+ GTEST_SKIP()
+ << "Warning: VP9 decoder unavailable, unable to check resize count!\n";
#endif
}
diff --git a/media/libvpx/libvpx/test/sum_squares_test.cc b/media/libvpx/libvpx/test/sum_squares_test.cc
index d3c76a34d2..57037f1e30 100644
--- a/media/libvpx/libvpx/test/sum_squares_test.cc
+++ b/media/libvpx/libvpx/test/sum_squares_test.cc
@@ -119,6 +119,13 @@ INSTANTIATE_TEST_SUITE_P(
&vpx_sum_squares_2d_i16_neon)));
#endif // HAVE_NEON
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, SumSquaresTest,
+ ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
+ &vpx_sum_squares_2d_i16_sve)));
+#endif // HAVE_SVE
+
#if HAVE_SSE2
INSTANTIATE_TEST_SUITE_P(
SSE2, SumSquaresTest,
diff --git a/media/libvpx/libvpx/test/variance_test.cc b/media/libvpx/libvpx/test/variance_test.cc
index b8320e9ceb..5cf6a5fb8e 100644
--- a/media/libvpx/libvpx/test/variance_test.cc
+++ b/media/libvpx/libvpx/test/variance_test.cc
@@ -29,6 +29,9 @@ namespace {
typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride);
+typedef void (*GetVarianceFunc)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse, int *sum);
typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
using libvpx_test::ACMRandom;
@@ -63,35 +66,65 @@ static unsigned int mb_ss_ref(const int16_t *src) {
* Our codebase calculates the "diff" value in the variance algorithm by
* (src - ref).
*/
-static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w,
- int l2h, int src_stride, int ref_stride,
- uint32_t *sse_ptr, bool use_high_bit_depth_,
- vpx_bit_depth_t bit_depth) {
- int64_t se = 0;
- uint64_t sse = 0;
- const int w = 1 << l2w;
- const int h = 1 << l2h;
+static void variance(const uint8_t *src, int src_stride, const uint8_t *ref,
+ int ref_stride, int w, int h, bool use_high_bit_depth_,
+ uint64_t *sse, int64_t *se, vpx_bit_depth_t bit_depth) {
+ int64_t se_long = 0;
+ uint64_t sse_long = 0;
+
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
- int diff;
+ int diff = 0;
if (!use_high_bit_depth_) {
diff = src[y * src_stride + x] - ref[y * ref_stride + x];
- se += diff;
- sse += diff * diff;
#if CONFIG_VP9_HIGHBITDEPTH
} else {
diff = CONVERT_TO_SHORTPTR(src)[y * src_stride + x] -
CONVERT_TO_SHORTPTR(ref)[y * ref_stride + x];
- se += diff;
- sse += diff * diff;
#endif // CONFIG_VP9_HIGHBITDEPTH
}
+ se_long += diff;
+ sse_long += diff * diff;
}
}
- RoundHighBitDepth(bit_depth, &se, &sse);
- *sse_ptr = static_cast<uint32_t>(sse);
+
+ RoundHighBitDepth(bit_depth, &se_long, &sse_long);
+
+ *sse = sse_long;
+ *se = se_long;
+}
+
+static void get_variance_ref(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int l2w,
+ int l2h, bool use_high_bit_depth_, uint32_t *sse,
+ int *se, vpx_bit_depth_t bit_depth) {
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+ int64_t se_long = 0;
+ uint64_t sse_long = 0;
+
+ variance(src, src_stride, ref, ref_stride, w, h, use_high_bit_depth_,
+ &sse_long, &se_long, bit_depth);
+
+ *sse = static_cast<uint32_t>(sse_long);
+ *se = static_cast<int>(se_long);
+}
+
+static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w,
+ int l2h, int src_stride, int ref_stride,
+ uint32_t *sse_ptr, bool use_high_bit_depth_,
+ vpx_bit_depth_t bit_depth) {
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+ int64_t se_long = 0;
+ uint64_t sse_long = 0;
+
+ variance(src, src_stride, ref, ref_stride, w, h, use_high_bit_depth_,
+ &sse_long, &se_long, bit_depth);
+
+ *sse_ptr = static_cast<uint32_t>(sse_long);
return static_cast<uint32_t>(
- sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h)));
+ sse_long - ((static_cast<int64_t>(se_long) * se_long) >> (l2w + l2h)));
}
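
The value variance_ref() returns is unchanged by the refactor: it is the
usual sum-of-squares identity, SSE minus the squared sum of differences
divided by the block area, where the area is a power of two so the division
reduces to a shift. Restated as a tiny helper mirroring the return statement
above:

  // var = sse - (se * se) / (w * h), with w = 1 << l2w and h = 1 << l2h.
  static uint32_t block_variance(uint64_t sse, int64_t se, int l2w, int l2h) {
    return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
  }
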
/* The subpel reference functions differ from the codec version in one aspect:
@@ -337,6 +370,9 @@ class MainTestClass
void OneQuarterTest();
void SpeedTest();
+ // GetVariance tests
+ void RefTestGetVar();
+
// MSE/SSE tests
void RefTestMse();
void RefTestSse();
@@ -493,6 +529,35 @@ void MainTestClass<VarianceFunctionType>::SpeedTest() {
}
////////////////////////////////////////////////////////////////////////////////
+// Tests related to GetVariance.
+template <typename GetVarianceFunctionType>
+void MainTestClass<GetVarianceFunctionType>::RefTestGetVar() {
+ for (int i = 0; i < 10; ++i) {
+ for (int j = 0; j < block_size(); j++) {
+ if (!use_high_bit_depth()) {
+ src_[j] = rnd_.Rand8();
+ ref_[j] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+ }
+ unsigned int sse1, sse2;
+ int sum1, sum2;
+ const int stride = width();
+ ASM_REGISTER_STATE_CHECK(
+ params_.func(src_, stride, ref_, stride, &sse1, &sum1));
+ get_variance_ref(src_, stride, ref_, stride, params_.log2width,
+ params_.log2height, use_high_bit_depth(), &sse2, &sum2,
+ params_.bit_depth);
+ EXPECT_EQ(sse1, sse2) << "Error at test index: " << i;
+ EXPECT_EQ(sum1, sum2) << "Error at test index: " << i;
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
// Tests related to MSE / SSE.
template <typename FunctionType>
@@ -766,6 +831,7 @@ void SubpelVarianceTest<vpx_subp_avg_variance_fn_t>::RefTest() {
typedef MainTestClass<Get4x4SseFunc> VpxSseTest;
typedef MainTestClass<vpx_variance_fn_t> VpxMseTest;
typedef MainTestClass<vpx_variance_fn_t> VpxVarianceTest;
+typedef MainTestClass<GetVarianceFunc> VpxGetVarianceTest;
typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxSubpelVarianceTest;
typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t> VpxSubpelAvgVarianceTest;
@@ -779,6 +845,7 @@ TEST_P(VpxVarianceTest, Ref) { RefTest(); }
TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); }
TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); }
TEST_P(VpxVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(VpxGetVarianceTest, RefGetVar) { RefTestGetVar(); }
TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); }
@@ -818,6 +885,16 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 3, &vpx_variance4x8_c),
VarianceParams(2, 2, &vpx_variance4x4_c)));
+typedef TestParams<GetVarianceFunc> GetVarianceParams;
+INSTANTIATE_TEST_SUITE_P(
+ C, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_c),
+ GetVarianceParams(3, 3, &vpx_get8x8var_c),
+ GetVarianceParams(4, 4, &vpx_get16x16var_c),
+ GetVarianceParams(3, 3, &vpx_get8x8var_c),
+ GetVarianceParams(4, 4, &vpx_get16x16var_c),
+ GetVarianceParams(3, 3, &vpx_get8x8var_c)));
+
typedef TestParams<vpx_subpixvariance_fn_t> SubpelVarianceParams;
INSTANTIATE_TEST_SUITE_P(
C, VpxSubpelVarianceTest,
@@ -856,6 +933,7 @@ INSTANTIATE_TEST_SUITE_P(
#if CONFIG_VP9_HIGHBITDEPTH
typedef MainTestClass<vpx_variance_fn_t> VpxHBDVarianceTest;
+typedef MainTestClass<GetVarianceFunc> VpxHBDGetVarianceTest;
typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxHBDSubpelVarianceTest;
typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t>
VpxHBDSubpelAvgVarianceTest;
@@ -865,6 +943,7 @@ TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); }
TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); }
TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
TEST_P(VpxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(VpxHBDGetVarianceTest, RefGetVar) { RefTestGetVar(); }
TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); }
TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
@@ -933,6 +1012,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 2, &vpx_highbd_8_variance4x4_c, 8)));
INSTANTIATE_TEST_SUITE_P(
+ C, VpxHBDGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_c, 12),
+ GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_c, 12),
+ GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_c, 10),
+ GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_c, 10),
+ GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_c, 8),
+ GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_c, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
C, VpxHBDSubpelVarianceTest,
::testing::Values(
SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
@@ -1119,6 +1207,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 2, &vpx_variance4x4_sse2)));
INSTANTIATE_TEST_SUITE_P(
+ SSE2, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_sse2),
+ GetVarianceParams(3, 3, &vpx_get8x8var_sse2),
+ GetVarianceParams(4, 4, &vpx_get16x16var_sse2),
+ GetVarianceParams(3, 3, &vpx_get8x8var_sse2),
+ GetVarianceParams(4, 4, &vpx_get16x16var_sse2),
+ GetVarianceParams(3, 3, &vpx_get8x8var_sse2)));
+
+INSTANTIATE_TEST_SUITE_P(
SSE2, VpxSubpelVarianceTest,
::testing::Values(
SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0),
@@ -1198,6 +1295,16 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sse2, 8)));
INSTANTIATE_TEST_SUITE_P(
+ SSE2, VpxHBDGetVarianceTest,
+ ::testing::Values(
+ GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_sse2, 12),
+ GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_sse2, 12),
+ GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_sse2, 10),
+ GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_sse2, 10),
+ GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_sse2, 8),
+ GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_sse2, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
SSE2, VpxHBDSubpelVarianceTest,
::testing::Values(
SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2,
@@ -1475,6 +1582,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 3, &vpx_variance4x8_neon),
VarianceParams(2, 2, &vpx_variance4x4_neon)));
+INSTANTIATE_TEST_SUITE_P(
+ NEON, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_neon),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon),
+ GetVarianceParams(4, 4, &vpx_get16x16var_neon),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon),
+ GetVarianceParams(4, 4, &vpx_get16x16var_neon),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon)));
+
#if HAVE_NEON_DOTPROD
INSTANTIATE_TEST_SUITE_P(
NEON_DOTPROD, VpxSseTest,
@@ -1502,6 +1618,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(3, 2, &vpx_variance8x4_neon_dotprod),
VarianceParams(2, 3, &vpx_variance4x8_neon_dotprod),
VarianceParams(2, 2, &vpx_variance4x4_neon_dotprod)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON_DOTPROD, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod),
+ GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod),
+ GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod)));
#endif // HAVE_NEON_DOTPROD
INSTANTIATE_TEST_SUITE_P(
@@ -1555,9 +1680,6 @@ INSTANTIATE_TEST_SUITE_P(
MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8),
MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8)));
-// TODO(webm:1819): Re-enable when vpx_highbd_8_mse16x16_neon_dotprod, etc. can
-// be used again.
-#if 0
#if HAVE_NEON_DOTPROD
INSTANTIATE_TEST_SUITE_P(
NEON_DOTPROD, VpxHBDMseTest,
@@ -1567,7 +1689,19 @@ INSTANTIATE_TEST_SUITE_P(
MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8),
MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8)));
#endif // HAVE_NEON_DOTPROD
-#endif // 0
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, VpxHBDMseTest,
+ ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sve, VPX_BITS_12),
+ MseParams(4, 3, &vpx_highbd_12_mse16x8_sve, VPX_BITS_12),
+ MseParams(3, 4, &vpx_highbd_12_mse8x16_sve, VPX_BITS_12),
+ MseParams(3, 3, &vpx_highbd_12_mse8x8_sve, VPX_BITS_12),
+ MseParams(4, 4, &vpx_highbd_10_mse16x16_sve, VPX_BITS_10),
+ MseParams(4, 3, &vpx_highbd_10_mse16x8_sve, VPX_BITS_10),
+ MseParams(3, 4, &vpx_highbd_10_mse8x16_sve, VPX_BITS_10),
+ MseParams(3, 3, &vpx_highbd_10_mse8x8_sve, VPX_BITS_10)));
+#endif // HAVE_SVE
INSTANTIATE_TEST_SUITE_P(
NEON, VpxHBDVarianceTest,
@@ -1613,6 +1747,28 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 2, &vpx_highbd_8_variance4x4_neon, 8)));
INSTANTIATE_TEST_SUITE_P(
+ NEON, VpxHBDGetVarianceTest,
+ ::testing::Values(
+ GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_neon, 12),
+ GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_neon, 12),
+ GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_neon, 10),
+ GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_neon, 10),
+ GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_neon, 8),
+ GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_neon, 8)));
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, VpxHBDGetVarianceTest,
+ ::testing::Values(
+ GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_sve, 12),
+ GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_sve, 12),
+ GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_sve, 10),
+ GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_sve, 10),
+ GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_sve, 8),
+ GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_sve, 8)));
+#endif // HAVE_SVE
+
+INSTANTIATE_TEST_SUITE_P(
NEON, VpxHBDSubpelVarianceTest,
::testing::Values(
SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_neon,
@@ -1815,6 +1971,53 @@ INSTANTIATE_TEST_SUITE_P(
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_NEON
+#if HAVE_SVE
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+ SVE, VpxHBDVarianceTest,
+ ::testing::Values(
+ VarianceParams(6, 6, &vpx_highbd_12_variance64x64_sve, 12),
+ VarianceParams(6, 5, &vpx_highbd_12_variance64x32_sve, 12),
+ VarianceParams(5, 6, &vpx_highbd_12_variance32x64_sve, 12),
+ VarianceParams(5, 5, &vpx_highbd_12_variance32x32_sve, 12),
+ VarianceParams(5, 4, &vpx_highbd_12_variance32x16_sve, 12),
+ VarianceParams(4, 5, &vpx_highbd_12_variance16x32_sve, 12),
+ VarianceParams(4, 4, &vpx_highbd_12_variance16x16_sve, 12),
+ VarianceParams(4, 3, &vpx_highbd_12_variance16x8_sve, 12),
+ VarianceParams(3, 4, &vpx_highbd_12_variance8x16_sve, 12),
+ VarianceParams(3, 3, &vpx_highbd_12_variance8x8_sve, 12),
+ VarianceParams(3, 2, &vpx_highbd_12_variance8x4_sve, 12),
+ VarianceParams(2, 3, &vpx_highbd_12_variance4x8_sve, 12),
+ VarianceParams(2, 2, &vpx_highbd_12_variance4x4_sve, 12),
+ VarianceParams(6, 6, &vpx_highbd_10_variance64x64_sve, 10),
+ VarianceParams(6, 5, &vpx_highbd_10_variance64x32_sve, 10),
+ VarianceParams(5, 6, &vpx_highbd_10_variance32x64_sve, 10),
+ VarianceParams(5, 5, &vpx_highbd_10_variance32x32_sve, 10),
+ VarianceParams(5, 4, &vpx_highbd_10_variance32x16_sve, 10),
+ VarianceParams(4, 5, &vpx_highbd_10_variance16x32_sve, 10),
+ VarianceParams(4, 4, &vpx_highbd_10_variance16x16_sve, 10),
+ VarianceParams(4, 3, &vpx_highbd_10_variance16x8_sve, 10),
+ VarianceParams(3, 4, &vpx_highbd_10_variance8x16_sve, 10),
+ VarianceParams(3, 3, &vpx_highbd_10_variance8x8_sve, 10),
+ VarianceParams(3, 2, &vpx_highbd_10_variance8x4_sve, 10),
+ VarianceParams(2, 3, &vpx_highbd_10_variance4x8_sve, 10),
+ VarianceParams(2, 2, &vpx_highbd_10_variance4x4_sve, 10),
+ VarianceParams(6, 6, &vpx_highbd_8_variance64x64_sve, 8),
+ VarianceParams(6, 5, &vpx_highbd_8_variance64x32_sve, 8),
+ VarianceParams(5, 6, &vpx_highbd_8_variance32x64_sve, 8),
+ VarianceParams(5, 5, &vpx_highbd_8_variance32x32_sve, 8),
+ VarianceParams(5, 4, &vpx_highbd_8_variance32x16_sve, 8),
+ VarianceParams(4, 5, &vpx_highbd_8_variance16x32_sve, 8),
+ VarianceParams(4, 4, &vpx_highbd_8_variance16x16_sve, 8),
+ VarianceParams(4, 3, &vpx_highbd_8_variance16x8_sve, 8),
+ VarianceParams(3, 4, &vpx_highbd_8_variance8x16_sve, 8),
+ VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sve, 8),
+ VarianceParams(3, 2, &vpx_highbd_8_variance8x4_sve, 8),
+ VarianceParams(2, 3, &vpx_highbd_8_variance4x8_sve, 8),
+ VarianceParams(2, 2, &vpx_highbd_8_variance4x4_sve, 8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_SVE
+
#if HAVE_MSA
INSTANTIATE_TEST_SUITE_P(MSA, SumOfSquaresTest,
::testing::Values(vpx_get_mb_ss_msa));
@@ -1846,6 +2049,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 2, &vpx_variance4x4_msa)));
INSTANTIATE_TEST_SUITE_P(
+ MSA, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_msa),
+ GetVarianceParams(3, 3, &vpx_get8x8var_msa),
+ GetVarianceParams(4, 4, &vpx_get16x16var_msa),
+ GetVarianceParams(3, 3, &vpx_get8x8var_msa),
+ GetVarianceParams(4, 4, &vpx_get16x16var_msa),
+ GetVarianceParams(3, 3, &vpx_get8x8var_msa)));
+
+INSTANTIATE_TEST_SUITE_P(
MSA, VpxSubpelVarianceTest,
::testing::Values(
SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0),
@@ -1908,6 +2120,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(3, 2, &vpx_variance8x4_vsx),
VarianceParams(2, 3, &vpx_variance4x8_vsx),
VarianceParams(2, 2, &vpx_variance4x4_vsx)));
+
+INSTANTIATE_TEST_SUITE_P(
+ VSX, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_vsx),
+ GetVarianceParams(3, 3, &vpx_get8x8var_vsx),
+ GetVarianceParams(4, 4, &vpx_get16x16var_vsx),
+ GetVarianceParams(3, 3, &vpx_get8x8var_vsx),
+ GetVarianceParams(4, 4, &vpx_get16x16var_vsx),
+ GetVarianceParams(3, 3, &vpx_get8x8var_vsx)));
#endif // HAVE_VSX
#if HAVE_MMI
diff --git a/media/libvpx/libvpx/test/video_source.h b/media/libvpx/libvpx/test/video_source.h
index 2194126f1f..2c035910db 100644
--- a/media/libvpx/libvpx/test/video_source.h
+++ b/media/libvpx/libvpx/test/video_source.h
@@ -236,7 +236,6 @@ class RandomVideoSource : public DummyVideoSource {
RandomVideoSource(int seed = ACMRandom::DeterministicSeed())
: rnd_(seed), seed_(seed) {}
- protected:
// Reset the RNG to get a matching stream for the second pass
void Begin() override {
frame_ = 0;
@@ -244,6 +243,7 @@ class RandomVideoSource : public DummyVideoSource {
FillFrame();
}
+ protected:
// 15 frames of noise, followed by 15 static frames. Reset to 0 rather
// than holding previous frames to encourage keyframes to be thrown.
void FillFrame() override {
diff --git a/media/libvpx/libvpx/test/vp8_datarate_test.cc b/media/libvpx/libvpx/test/vp8_datarate_test.cc
index aee27af66e..d47ed298fe 100644
--- a/media/libvpx/libvpx/test/vp8_datarate_test.cc
+++ b/media/libvpx/libvpx/test/vp8_datarate_test.cc
@@ -14,7 +14,7 @@
#include "test/i420_video_source.h"
#include "test/util.h"
#include "test/y4m_video_source.h"
-#include "vpx/vpx_codec.h"
+#include "vpx/vpx_encoder.h"
namespace {
@@ -260,6 +260,27 @@ class DatarateTestLarge
<< " The datarate for the file missed the target!";
}
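+ // Four-thread CBR encode with PSNR packets enabled; verifies that the
+ // measured datarate lands within a factor of two of the 1000 kbps target.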
+ virtual void MultiThreadsPSNRTest() {
+ denoiser_on_ = 0;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_threads = 4;
+ init_flags_ = VPX_CODEC_USE_PSNR;
+
+ ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+ 1280, 720, 30, 1, 0, 30);
+ cfg_.rc_target_bitrate = 1000;
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.5)
+ << " The datarate for the file exceeds the target!";
+
+ ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 2.0)
+ << " The datarate for the file missed the target!";
+ }
+
vpx_codec_pts_t last_pts_;
int64_t bits_in_buffer_model_;
double timebase_;
@@ -324,6 +345,8 @@ TEST_P(DatarateTestRealTime, DropFramesMultiThreads) {
DropFramesMultiThreadsTest();
}
+TEST_P(DatarateTestRealTime, MultiThreadsPSNR) { MultiThreadsPSNRTest(); }
+
TEST_P(DatarateTestRealTime, RegionOfInterest) {
denoiser_on_ = 0;
cfg_.rc_buf_initial_sz = 500;
diff --git a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc
index 50478f7635..d87fef5a46 100644
--- a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc
+++ b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc
@@ -149,9 +149,16 @@ class Vp8RcInterfaceTest
return;
}
int qp;
+ libvpx::UVDeltaQP uv_delta_qp;
encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp);
if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) {
ASSERT_EQ(rc_api_->GetQP(), qp);
+ uv_delta_qp = rc_api_->GetUVDeltaQP();
+ // The delta QP for the UV channels is only set for screen content.
+ if (!rc_cfg_.is_screen) {
+ ASSERT_EQ(uv_delta_qp.uvdc_delta_q, 0);
+ ASSERT_EQ(uv_delta_qp.uvac_delta_q, 0);
+ }
} else {
num_drops_++;
}
diff --git a/media/libvpx/libvpx/test/vp9_block_error_test.cc b/media/libvpx/libvpx/test/vp9_block_error_test.cc
index 0645341ac1..c5ddcd58ab 100644
--- a/media/libvpx/libvpx/test/vp9_block_error_test.cc
+++ b/media/libvpx/libvpx/test/vp9_block_error_test.cc
@@ -215,4 +215,13 @@ const BlockErrorParam neon_block_error_tests[] = {
INSTANTIATE_TEST_SUITE_P(NEON, BlockErrorTest,
::testing::ValuesIn(neon_block_error_tests));
#endif // HAVE_NEON
+
+#if HAVE_SVE
+const BlockErrorParam sve_block_error_tests[] = { make_tuple(
+ &BlockError8BitWrapper<vp9_block_error_sve>,
+ &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8) };
+
+INSTANTIATE_TEST_SUITE_P(SVE, BlockErrorTest,
+ ::testing::ValuesIn(sve_block_error_tests));
+#endif // HAVE_SVE
} // namespace
diff --git a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc
index 33fa05c65c..5c23a5b0d5 100644
--- a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc
+++ b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc
@@ -10,115 +10,78 @@
#include <cstdint>
#include <new>
+#include <memory>
+
+#include "./vpx_config.h"
#include "test/codec_factory.h"
#include "test/encode_test_driver.h"
#include "test/util.h"
#include "test/yuv_video_source.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
+#if CONFIG_VP9_DECODER
+#include "vpx/vp8dx.h"
+#endif
#include "vp9/simple_encode.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_encoder.h"
#include "vpx/vpx_ext_ratectrl.h"
+#include "vpx/vpx_image.h"
#include "vpx/vpx_tpl.h"
#include "vpx_dsp/vpx_dsp_common.h"
namespace {
-constexpr int kModelMagicNumber = 51396;
-constexpr uintptr_t PrivMagicNumber = 5566;
-constexpr int kFrameNum = 5;
-constexpr int kFrameNumGOP = 30;
-constexpr int kFrameNumGOPShort = 4;
-constexpr int kLosslessCodingIndex = 2;
-constexpr int kFixedGOPSize = 9;
-// The range check in vp9_cx_iface.c shows that the max
-// lag in buffer is MAX_LAG_BUFFERS (25):
-// RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
-constexpr int kMaxLagInFrames = 25;
-constexpr int kDefaultMinGfInterval = 4;
-constexpr int kDefaultMaxGfInterval = 16;
-// The active gf interval might change for each GOP
-// See function "get_active_gf_inverval_range".
-// The numbers below are from manual inspection.
-constexpr int kReadMinGfInterval = 5;
-constexpr int kReadMaxGfInterval = 13;
-const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv";
-const double kPsnrThreshold = 30.4;
-
-struct ToyRateCtrl {
- int magic_number;
- int coding_index;
-
- int gop_global_index;
- int frames_since_key;
- int show_index;
+constexpr int kFrameNum = 10;
+constexpr int kFixedGOPSize = 10;
+constexpr int kKeyframeQp = 10;
+constexpr int kLeafQp = 40;
+constexpr int kArfQp = 15;
+
+// Simple external rate controller for testing.
+class RateControllerForTest {
+ public:
+ RateControllerForTest() : current_gop_(-1) {}
+ ~RateControllerForTest() {}
+
+ void StartNextGop() { ++current_gop_; }
+
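+ // Every GOP spans kFixedGOPSize coding frames and carries an alt-ref frame;
+ // only the very first GOP begins with a key frame.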
+ vpx_rc_gop_decision_t GetCurrentGop() const {
+ vpx_rc_gop_decision_t gop_decision;
+ gop_decision.use_key_frame = current_gop_ == 0 ? 1 : 0;
+ gop_decision.use_alt_ref = 1;
+ gop_decision.gop_coding_frames = kFixedGOPSize;
+ return gop_decision;
+ }
+
+ int CalculateFrameDecision(int frame_index) {
+ EXPECT_LE(frame_index, kFixedGOPSize);
+ if (current_gop_ == 0 && frame_index == 0) {
+ // Key frame, first frame in the first GOP.
+ return kKeyframeQp;
+ } else if (frame_index == 1) {
+ // ARF; this test always codes frame 1 of each GOP as an alt-ref frame.
+ return kArfQp;
+ } else {
+ return kLeafQp;
+ }
+ }
+ int current_gop_;
};
-vpx_rc_status_t rc_create_model(void *priv,
- const vpx_rc_config_t *ratectrl_config,
- vpx_rc_model_t *rate_ctrl_model_ptr) {
- ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
- if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
- toy_rate_ctrl->magic_number = kModelMagicNumber;
- toy_rate_ctrl->coding_index = -1;
- *rate_ctrl_model_ptr = toy_rate_ctrl;
- EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
- EXPECT_EQ(ratectrl_config->frame_width, 352);
- EXPECT_EQ(ratectrl_config->frame_height, 288);
- EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNum);
- EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 24000);
- EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
- EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_create_model_gop(void *priv,
- const vpx_rc_config_t *ratectrl_config,
- vpx_rc_model_t *rate_ctrl_model_ptr) {
- ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
- if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
- toy_rate_ctrl->magic_number = kModelMagicNumber;
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- toy_rate_ctrl->show_index = 0;
- toy_rate_ctrl->coding_index = 0;
- *rate_ctrl_model_ptr = toy_rate_ctrl;
- EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
- EXPECT_EQ(ratectrl_config->frame_width, 640);
- EXPECT_EQ(ratectrl_config->frame_height, 360);
- EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOP);
- EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 4000);
- EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
- EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_create_model_gop_short(
- void *priv, const vpx_rc_config_t *ratectrl_config,
+// Callbacks used in this test.
+vpx_rc_status_t rc_test_create_model(
+ void * /*priv*/, const vpx_rc_config_t * /*ratectrl_config*/,
vpx_rc_model_t *rate_ctrl_model_ptr) {
- ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
- if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
- toy_rate_ctrl->magic_number = kModelMagicNumber;
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- toy_rate_ctrl->show_index = 0;
- toy_rate_ctrl->coding_index = 0;
- *rate_ctrl_model_ptr = toy_rate_ctrl;
- EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
- EXPECT_EQ(ratectrl_config->frame_width, 352);
- EXPECT_EQ(ratectrl_config->frame_height, 288);
- EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOPShort);
- EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 500);
- EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
- EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
+ std::unique_ptr<RateControllerForTest> test_controller(
+ new RateControllerForTest());
+ *rate_ctrl_model_ptr = test_controller.release();
return VPX_RC_OK;
}
-vpx_rc_status_t rc_send_firstpass_stats(
- vpx_rc_model_t rate_ctrl_model,
+vpx_rc_status_t rc_test_send_firstpass_stats(
+ vpx_rc_model_t /*rate_ctrl_model*/,
const vpx_rc_firstpass_stats_t *first_pass_stats) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
EXPECT_EQ(first_pass_stats->num_frames, kFrameNum);
for (int i = 0; i < first_pass_stats->num_frames; ++i) {
EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
@@ -126,37 +89,8 @@ vpx_rc_status_t rc_send_firstpass_stats(
return VPX_RC_OK;
}
-vpx_rc_status_t rc_send_firstpass_stats_gop(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_firstpass_stats_t *first_pass_stats) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOP);
- for (int i = 0; i < first_pass_stats->num_frames; ++i) {
- EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
- }
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_send_firstpass_stats_gop_short(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_firstpass_stats_t *first_pass_stats) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOPShort);
- for (int i = 0; i < first_pass_stats->num_frames; ++i) {
- EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
- }
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model,
- const VpxTplGopStats *tpl_gop_stats) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+vpx_rc_status_t rc_test_send_tpl_gop_stats(
+ vpx_rc_model_t /*rate_ctrl_model*/, const VpxTplGopStats *tpl_gop_stats) {
EXPECT_GT(tpl_gop_stats->size, 0);
for (int i = 0; i < tpl_gop_stats->size; ++i) {
@@ -165,522 +99,38 @@ vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model,
return VPX_RC_OK;
}
-vpx_rc_status_t rc_get_encodeframe_decision(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
+vpx_rc_status_t rc_test_get_encodeframe_decision(
+ vpx_rc_model_t rate_ctrl_model, const int frame_gop_index,
vpx_rc_encodeframe_decision_t *frame_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- toy_rate_ctrl->coding_index += 1;
-
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-
- EXPECT_LT(encode_frame_info->show_index, kFrameNum);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- if (encode_frame_info->coding_index == 0) {
- EXPECT_EQ(encode_frame_info->show_index, 0);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- } else if (encode_frame_info->coding_index == 1) {
- EXPECT_EQ(encode_frame_info->show_index, 4);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 0); // kRefFrameTypeLast
- } else if (encode_frame_info->coding_index >= 2 &&
- encode_frame_info->coding_index < 5) {
- // In the first group of pictures, coding_index and gop_index are equal.
- EXPECT_EQ(encode_frame_info->gop_index, encode_frame_info->coding_index);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- } else if (encode_frame_info->coding_index == 5) {
- EXPECT_EQ(encode_frame_info->show_index, 4);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 1); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 1); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 4); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[2],
- 1); // kRefFrameTypeFuture
- }
- if (encode_frame_info->coding_index == kLosslessCodingIndex) {
- // We should get sse == 0 at rc_update_encodeframe_result()
- frame_decision->q_index = 0;
- } else {
- frame_decision->q_index = 100;
- }
- frame_decision->max_frame_size = 0;
+ RateControllerForTest *test_controller =
+ static_cast<RateControllerForTest *>(rate_ctrl_model);
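+ // Map the frame's position within the GOP to one of the fixed test QPs.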
+ frame_decision->q_index =
+ test_controller->CalculateFrameDecision(frame_gop_index);
return VPX_RC_OK;
}
-vpx_rc_status_t rc_get_encodeframe_decision_gop(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
- vpx_rc_encodeframe_decision_t *frame_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_LT(encode_frame_info->show_index, kFrameNumGOP);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- if (encode_frame_info->coding_index == 0) {
- EXPECT_EQ(encode_frame_info->show_index, 0);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- } else if (encode_frame_info->coding_index == 1) {
- EXPECT_EQ(encode_frame_info->show_index, 1);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 0); // kRefFrameTypeLast
- } else if (encode_frame_info->coding_index == 2) {
- EXPECT_EQ(encode_frame_info->show_index, 2);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- } else if (encode_frame_info->coding_index == 3 ||
- encode_frame_info->coding_index == 12 ||
- encode_frame_info->coding_index == 21) {
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- } else if (encode_frame_info->coding_index == 11 ||
- encode_frame_info->coding_index == 20 ||
- encode_frame_info->coding_index == 29) {
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- } else if (encode_frame_info->coding_index >= 30) {
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- }
-
- // When the model recommends an invalid q, valid range [0, 255],
- // the encoder will ignore it and use the default q selected
- // by libvpx rate control strategy.
- frame_decision->q_index = VPX_DEFAULT_Q;
- frame_decision->max_frame_size = 0;
-
- toy_rate_ctrl->coding_index += 1;
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_encodeframe_decision_gop_short(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
- vpx_rc_encodeframe_decision_t *frame_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- if (encode_frame_info->coding_index == 0) {
- EXPECT_EQ(encode_frame_info->show_index, 0);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 1) {
- EXPECT_EQ(encode_frame_info->show_index, 1);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 2) {
- EXPECT_EQ(encode_frame_info->show_index, 2);
- EXPECT_EQ(encode_frame_info->gop_index, 2);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 3) {
- EXPECT_EQ(encode_frame_info->show_index, 3);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2);
- }
-
- // When the model recommends an invalid q, valid range [0, 255],
- // the encoder will ignore it and use the default q selected
- // by libvpx rate control strategy.
- frame_decision->q_index = VPX_DEFAULT_Q;
- frame_decision->max_frame_size = 0;
-
- toy_rate_ctrl->coding_index += 1;
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
- vpx_rc_encodeframe_decision_t *frame_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- if (encode_frame_info->coding_index == 0) {
- EXPECT_EQ(encode_frame_info->show_index, 0);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 1) {
- EXPECT_EQ(encode_frame_info->show_index, 3);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 2) {
- EXPECT_EQ(encode_frame_info->show_index, 1);
- EXPECT_EQ(encode_frame_info->gop_index, 2);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 3) {
- EXPECT_EQ(encode_frame_info->show_index, 2);
- EXPECT_EQ(encode_frame_info->gop_index, 3);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 4) {
- EXPECT_EQ(encode_frame_info->show_index, 3);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- }
-
- // When the model recommends an invalid q, valid range [0, 255],
- // the encoder will ignore it and use the default q selected
- // by libvpx rate control strategy.
- frame_decision->q_index = VPX_DEFAULT_Q;
- frame_decision->max_frame_size = 0;
-
- toy_rate_ctrl->coding_index += 1;
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_encodeframe_decision_gop_short_no_arf(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
- vpx_rc_encodeframe_decision_t *frame_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- if (encode_frame_info->coding_index == 0) {
- EXPECT_EQ(encode_frame_info->show_index, 0);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 1) {
- EXPECT_EQ(encode_frame_info->show_index, 1);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 2) {
- EXPECT_EQ(encode_frame_info->show_index, 2);
- EXPECT_EQ(encode_frame_info->gop_index, 2);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 3) {
- EXPECT_EQ(encode_frame_info->show_index, 3);
- EXPECT_EQ(encode_frame_info->gop_index, 3);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- }
-
- // When the model recommends an invalid q, valid range [0, 255],
- // the encoder will ignore it and use the default q selected
- // by libvpx rate control strategy.
- frame_decision->q_index = VPX_DEFAULT_Q;
- frame_decision->max_frame_size = 0;
-
- toy_rate_ctrl->coding_index += 1;
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_gop_info_t *gop_info,
- vpx_rc_gop_decision_t *gop_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames);
- EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
- EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
- EXPECT_EQ(gop_info->active_min_gf_interval, kReadMinGfInterval);
- EXPECT_EQ(gop_info->active_max_gf_interval, kReadMaxGfInterval);
- EXPECT_EQ(gop_info->allow_alt_ref, 1);
- if (gop_info->is_key_frame) {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- EXPECT_EQ(gop_info->frames_since_key, 0);
- EXPECT_EQ(gop_info->gop_global_index, 0);
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- } else {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1);
- }
- EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
- EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
- EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
- EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
-
- gop_decision->gop_coding_frames =
- VPXMIN(kFixedGOPSize, gop_info->frames_to_key);
- gop_decision->use_alt_ref = gop_decision->gop_coding_frames == kFixedGOPSize;
- toy_rate_ctrl->frames_since_key +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- toy_rate_ctrl->show_index +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- ++toy_rate_ctrl->gop_global_index;
- return VPX_RC_OK;
-}
-
-// Test on a 4 frame video.
-// Test a setting of 2 GOPs.
-// The first GOP has 3 coding frames, no alt ref.
-// The second GOP has 1 coding frame, no alt ref.
-vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_gop_info_t *gop_info,
- vpx_rc_gop_decision_t *gop_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
- EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
- EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
- EXPECT_EQ(gop_info->allow_alt_ref, 1);
- if (gop_info->is_key_frame) {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- EXPECT_EQ(gop_info->frames_since_key, 0);
- EXPECT_EQ(gop_info->gop_global_index, 0);
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- } else {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- }
- EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
- EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
- EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
- EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
-
- gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 3 : 1;
- gop_decision->use_alt_ref = 0;
- toy_rate_ctrl->frames_since_key +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- toy_rate_ctrl->show_index +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- ++toy_rate_ctrl->gop_global_index;
- return VPX_RC_OK;
-}
-
-// Test on a 4 frame video.
-// Test a setting of 2 GOPs.
-// The first GOP has 4 coding frames. Use alt ref.
-// The second GOP only contains the overlay frame of the first GOP's alt ref
-// frame.
-vpx_rc_status_t rc_get_gop_decision_short_overlay(
- vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
- vpx_rc_gop_decision_t *gop_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
- EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
- EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
- EXPECT_EQ(gop_info->allow_alt_ref, 1);
- if (gop_info->is_key_frame) {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- EXPECT_EQ(gop_info->frames_since_key, 0);
- EXPECT_EQ(gop_info->gop_global_index, 0);
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- } else {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1);
- }
- EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
- EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
- EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
- EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
-
- gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1;
- gop_decision->use_alt_ref = gop_info->is_key_frame ? 1 : 0;
- toy_rate_ctrl->frames_since_key +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- toy_rate_ctrl->show_index +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- ++toy_rate_ctrl->gop_global_index;
- return VPX_RC_OK;
-}
-
-// Test on a 4 frame video.
-// Test a setting of 1 GOP.
-// The GOP has 4 coding frames. Do not use alt ref.
-vpx_rc_status_t rc_get_gop_decision_short_no_arf(
- vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
- vpx_rc_gop_decision_t *gop_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
- EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
- EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
- EXPECT_EQ(gop_info->allow_alt_ref, 1);
- if (gop_info->is_key_frame) {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- EXPECT_EQ(gop_info->frames_since_key, 0);
- EXPECT_EQ(gop_info->gop_global_index, 0);
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- } else {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- }
- EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
- EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
- EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
- EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
-
- gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1;
- gop_decision->use_alt_ref = 0;
- toy_rate_ctrl->frames_since_key +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- toy_rate_ctrl->show_index +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- ++toy_rate_ctrl->gop_global_index;
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_update_encodeframe_result(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_result_t *encode_frame_result) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-
- const int64_t ref_pixel_count = 352 * 288 * 3 / 2;
- EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
- if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) {
- EXPECT_EQ(encode_frame_result->sse, 0);
- }
- if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) {
- EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 0);
- } else {
- EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 100);
- }
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_update_encodeframe_result_gop(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_result_t *encode_frame_result) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-
- const int64_t ref_pixel_count = 640 * 360 * 3 / 2;
- EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_update_encodeframe_result_gop_short(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_result_t *encode_frame_result) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-
- const int64_t ref_pixel_count = 352 * 288 * 3 / 2;
- EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_default_frame_rdmult(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info, int *rdmult) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- *rdmult = VPX_DEFAULT_RDMULT;
+vpx_rc_status_t rc_test_get_gop_decision(vpx_rc_model_t rate_ctrl_model,
+ vpx_rc_gop_decision_t *gop_decision) {
+ RateControllerForTest *test_controller =
+ static_cast<RateControllerForTest *>(rate_ctrl_model);
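+ // Advance to the next GOP and report its fixed structure to the encoder.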
+ test_controller->StartNextGop();
+ *gop_decision = test_controller->GetCurrentGop();
return VPX_RC_OK;
}
vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- delete toy_rate_ctrl;
+ RateControllerForTest *test_controller =
+ static_cast<RateControllerForTest *>(rate_ctrl_model);
+ delete test_controller;
return VPX_RC_OK;
}
class ExtRateCtrlTest : public ::libvpx_test::EncoderTest,
public ::testing::Test {
protected:
- ExtRateCtrlTest() : EncoderTest(&::libvpx_test::kVP9) {}
+ ExtRateCtrlTest()
+ : EncoderTest(&::libvpx_test::kVP9), frame_number_(0),
+ current_frame_qp_(0) {}
~ExtRateCtrlTest() override = default;
@@ -693,287 +143,62 @@ class ExtRateCtrlTest : public ::libvpx_test::EncoderTest,
::libvpx_test::Encoder *encoder) override {
if (video->frame() == 0) {
vpx_rc_funcs_t rc_funcs = {};
- rc_funcs.rc_type = VPX_RC_QP;
- rc_funcs.create_model = rc_create_model;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats;
- rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision;
- rc_funcs.update_encodeframe_result = rc_update_encodeframe_result;
- rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
- encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
- }
- }
-};
-
-TEST_F(ExtRateCtrlTest, EncodeTest) {
- cfg_.rc_target_bitrate = 24000;
-
- std::unique_ptr<libvpx_test::VideoSource> video;
- video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0,
- kFrameNum));
-
- ASSERT_NE(video, nullptr);
- ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest,
- public ::libvpx_test::CodecTestWithParam<int> {
- protected:
- ExtRateCtrlTestGOP() : EncoderTest(&::libvpx_test::kVP9) {}
-
- ~ExtRateCtrlTestGOP() override = default;
-
- void SetUp() override {
- InitializeConfig();
- SetMode(::libvpx_test::kTwoPassGood);
- }
-
- void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
- ::libvpx_test::Encoder *encoder) override {
- if (video->frame() == 0) {
- encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
- encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
-
- vpx_rc_funcs_t rc_funcs = {};
- rc_funcs.rc_type = VPX_RC_GOP_QP;
- rc_funcs.create_model = rc_create_model_gop;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop;
- rc_funcs.send_tpl_gop_stats = rc_send_tpl_gop_stats;
- rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop;
- rc_funcs.get_gop_decision = rc_get_gop_decision;
- rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop;
- rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
- encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
- }
- }
-};
-
-TEST_F(ExtRateCtrlTestGOP, EncodeTest) {
- cfg_.rc_target_bitrate = 4000;
- cfg_.g_lag_in_frames = kMaxLagInFrames;
- cfg_.rc_end_usage = VPX_VBR;
-
- std::unique_ptr<libvpx_test::VideoSource> video;
- video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- "noisy_clip_640_360.y4m", VPX_IMG_FMT_I420, 640, 360, 30, 1, 0,
- kFrameNumGOP));
-
- ASSERT_NE(video, nullptr);
- ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest,
- public ::libvpx_test::CodecTestWithParam<int> {
- protected:
- ExtRateCtrlTestGOPShort() : EncoderTest(&::libvpx_test::kVP9) {}
-
- ~ExtRateCtrlTestGOPShort() override = default;
-
- void SetUp() override {
- InitializeConfig();
- SetMode(::libvpx_test::kTwoPassGood);
- }
-
- void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
- ::libvpx_test::Encoder *encoder) override {
- if (video->frame() == 0) {
- encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
- encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
- encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
-
- vpx_rc_funcs_t rc_funcs = {};
- rc_funcs.rc_type = VPX_RC_GOP_QP;
- rc_funcs.create_model = rc_create_model_gop_short;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
- rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short;
- rc_funcs.get_gop_decision = rc_get_gop_decision_short;
- rc_funcs.update_encodeframe_result =
- rc_update_encodeframe_result_gop_short;
- rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
- encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
- }
- }
-};
-
-TEST_F(ExtRateCtrlTestGOPShort, EncodeTest) {
- cfg_.rc_target_bitrate = 500;
- cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
- cfg_.rc_end_usage = VPX_VBR;
-
- std::unique_ptr<libvpx_test::VideoSource> video;
- video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
-
- ASSERT_NE(video, nullptr);
- ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestGOPShortOverlay
- : public ::libvpx_test::EncoderTest,
- public ::libvpx_test::CodecTestWithParam<int> {
- protected:
- ExtRateCtrlTestGOPShortOverlay() : EncoderTest(&::libvpx_test::kVP9) {}
-
- ~ExtRateCtrlTestGOPShortOverlay() override = default;
-
- void SetUp() override {
- InitializeConfig();
- SetMode(::libvpx_test::kTwoPassGood);
- }
-
- void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
- ::libvpx_test::Encoder *encoder) override {
- if (video->frame() == 0) {
- encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
- encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
- encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
-
- vpx_rc_funcs_t rc_funcs = {};
rc_funcs.rc_type = VPX_RC_GOP_QP;
- rc_funcs.create_model = rc_create_model_gop_short;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
- rc_funcs.get_encodeframe_decision =
- rc_get_encodeframe_decision_gop_short_overlay;
- rc_funcs.get_gop_decision = rc_get_gop_decision_short_overlay;
- rc_funcs.update_encodeframe_result =
- rc_update_encodeframe_result_gop_short;
+ rc_funcs.create_model = rc_test_create_model;
+ rc_funcs.send_firstpass_stats = rc_test_send_firstpass_stats;
+ rc_funcs.send_tpl_gop_stats = rc_test_send_tpl_gop_stats;
+ rc_funcs.get_gop_decision = rc_test_get_gop_decision;
+ rc_funcs.get_encodeframe_decision = rc_test_get_encodeframe_decision;
rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
}
}
-};
-
-TEST_F(ExtRateCtrlTestGOPShortOverlay, EncodeTest) {
- cfg_.rc_target_bitrate = 500;
- cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
- cfg_.rc_end_usage = VPX_VBR;
-
- std::unique_ptr<libvpx_test::VideoSource> video;
- video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
-
- ASSERT_NE(video, nullptr);
- ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestGOPShortNoARF
- : public ::libvpx_test::EncoderTest,
- public ::libvpx_test::CodecTestWithParam<int> {
- protected:
- ExtRateCtrlTestGOPShortNoARF() : EncoderTest(&::libvpx_test::kVP9) {}
-
- ~ExtRateCtrlTestGOPShortNoARF() override = default;
- void SetUp() override {
- InitializeConfig();
- SetMode(::libvpx_test::kTwoPassGood);
+#if CONFIG_VP9_DECODER
+ bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+ const ::libvpx_test::VideoSource & /*video*/,
+ ::libvpx_test::Decoder *decoder) override {
+ EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
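+ // Record the quantizer the decoder reports so FramePktHook can check it
+ // against the QP requested by the external rate controller.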
+ decoder->Control(VPXD_GET_LAST_QUANTIZER, &current_frame_qp_);
+ return VPX_CODEC_OK == res_dec;
}
- void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
- ::libvpx_test::Encoder *encoder) override {
- if (video->frame() == 0) {
- encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
- encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
- encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
-
- vpx_rc_funcs_t rc_funcs = {};
- rc_funcs.rc_type = VPX_RC_GOP_QP;
- rc_funcs.create_model = rc_create_model_gop_short;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
- rc_funcs.get_encodeframe_decision =
- rc_get_encodeframe_decision_gop_short_no_arf;
- rc_funcs.get_gop_decision = rc_get_gop_decision_short_no_arf;
- rc_funcs.update_encodeframe_result =
- rc_update_encodeframe_result_gop_short;
- rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
- encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+ void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
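+ // Classify each output packet (key frame, invisible alt-ref, or leaf
+ // frame) and verify it was coded at the expected fixed QP.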
+ if (frame_number_ == 0) {
+ // The first encoded frame must be a key frame.
+ EXPECT_TRUE((pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0);
+ EXPECT_EQ(current_frame_qp_, kKeyframeQp);
+ ++frame_number_;
+ return;
}
- }
-};
-
-TEST_F(ExtRateCtrlTestGOPShortNoARF, EncodeTest) {
- cfg_.rc_target_bitrate = 500;
- cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
- cfg_.rc_end_usage = VPX_VBR;
-
- std::unique_ptr<libvpx_test::VideoSource> video;
- video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
-
- ASSERT_NE(video, nullptr);
- ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest,
- public ::testing::Test {
- protected:
- ExtRateCtrlTestRdmult() : EncoderTest(&::libvpx_test::kVP9) {}
-
- ~ExtRateCtrlTestRdmult() override = default;
-
- void SetUp() override {
- InitializeConfig();
- SetMode(::libvpx_test::kTwoPassGood);
- }
-
- void BeginPassHook(unsigned int) override {
- psnr_ = 0.0;
- nframes_ = 0;
- }
-
- void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
- psnr_ += pkt->data.psnr.psnr[0];
- nframes_++;
- }
- void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
- ::libvpx_test::Encoder *encoder) override {
- if (video->frame() == 0) {
- vpx_rc_funcs_t rc_funcs = {};
- rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT;
- rc_funcs.create_model = rc_create_model_gop_short;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
- rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short;
- rc_funcs.get_gop_decision = rc_get_gop_decision_short;
- rc_funcs.update_encodeframe_result =
- rc_update_encodeframe_result_gop_short;
- rc_funcs.get_frame_rdmult = rc_get_default_frame_rdmult;
- rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
- encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+ if ((pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) != 0) {
+ // An invisible frame is the alt-ref frame in this test.
+ EXPECT_EQ(current_frame_qp_, kArfQp);
+ ++frame_number_;
+ return;
}
- }
- double GetAveragePsnr() const {
- if (nframes_) return psnr_ / nframes_;
- return 0.0;
+ EXPECT_EQ(current_frame_qp_, kLeafQp);
+ ++frame_number_;
}
+#endif // CONFIG_VP9_DECODER
- private:
- double psnr_;
- unsigned int nframes_;
+ int frame_number_;
+ int current_frame_qp_;
};
-TEST_F(ExtRateCtrlTestRdmult, DefaultRdmult) {
- cfg_.rc_target_bitrate = 500;
- cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
- cfg_.rc_end_usage = VPX_VBR;
- init_flags_ = VPX_CODEC_USE_PSNR;
+TEST_F(ExtRateCtrlTest, EncodeTest) {
+ cfg_.rc_target_bitrate = 4000;
+ cfg_.g_lag_in_frames = 25;
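+ // 25 is the encoder's maximum lag (MAX_LAG_BUFFERS), so full GOPs with
+ // alt-ref frames can be constructed.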
std::unique_ptr<libvpx_test::VideoSource> video;
video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+ "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0,
+ kFrameNum));
ASSERT_NE(video, nullptr);
ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-
- const double psnr = GetAveragePsnr();
- EXPECT_GT(psnr, kPsnrThreshold);
}
} // namespace
diff --git a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc
index f7be47542c..a6c7563348 100644
--- a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc
+++ b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc
@@ -9,6 +9,7 @@
*/
#include "vp9/ratectrl_rtc.h"
+#include <climits>
#include <fstream> // NOLINT
#include <string>
@@ -19,6 +20,8 @@
#include "test/i420_video_source.h"
#include "test/util.h"
#include "test/video_source.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
#include "vpx/vpx_codec.h"
#include "vpx_ports/bitops.h"
diff --git a/media/libvpx/libvpx/test/vp9_scale_test.cc b/media/libvpx/libvpx/test/vp9_scale_test.cc
index 049a10a617..a5a18a7e9d 100644
--- a/media/libvpx/libvpx/test/vp9_scale_test.cc
+++ b/media/libvpx/libvpx/test/vp9_scale_test.cc
@@ -48,12 +48,11 @@ class ScaleTest : public VpxScaleBase,
}
void RunTest(INTERP_FILTER filter_type) {
- static const int kNumSizesToTest = 20;
+ static const int kNumSizesToTest = 22;
static const int kNumScaleFactorsToTest = 4;
- static const int kSizesToTest[] = {
- 2, 4, 6, 8, 10, 12, 14, 16, 18, 20,
- 22, 24, 26, 28, 30, 32, 34, 68, 128, 134
- };
+ static const int kSizesToTest[] = { 1, 2, 3, 4, 6, 8, 10, 12,
+ 14, 16, 18, 20, 22, 24, 26, 28,
+ 30, 32, 34, 68, 128, 134 };
static const int kScaleFactors[] = { 1, 2, 3, 4 };
for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) {
for (int h = 0; h < kNumSizesToTest; ++h) {
diff --git a/media/libvpx/libvpx/tools_common.c b/media/libvpx/libvpx/tools_common.c
index 5c13781513..5af971f720 100644
--- a/media/libvpx/libvpx/tools_common.c
+++ b/media/libvpx/libvpx/tools_common.c
@@ -26,15 +26,9 @@
#include "vpx/vpx_codec.h"
-#if defined(_WIN32) || defined(__OS2__)
+#if defined(_WIN32)
#include <io.h>
#include <fcntl.h>
-
-#ifdef __OS2__
-#define _setmode setmode
-#define _fileno fileno
-#define _O_BINARY O_BINARY
-#endif
#endif
#define LOG_ERROR(label) \
@@ -58,7 +52,7 @@ static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
FILE *set_binary_mode(FILE *stream) {
(void)stream;
-#if defined(_WIN32) || defined(__OS2__)
+#if defined(_WIN32)
_setmode(_fileno(stream), _O_BINARY);
#endif
return stream;
@@ -96,9 +90,9 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) {
int w = vpx_img_plane_width(yuv_frame, plane);
const int h = vpx_img_plane_height(yuv_frame, plane);
int r;
- // Assuming that for nv12 we read all chroma data at one time
+ // Assuming that for nv12 we read all chroma data at once
if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
- // Fixing NV12 chroma width it is odd
+ // Fixing NV12 chroma width if it is odd
if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
/* Determine the correct plane based on the image format. The for-loop
* always counts in Y,U,V order, but this may not match the order of
@@ -229,17 +223,22 @@ int vpx_img_plane_height(const vpx_image_t *img, int plane) {
void vpx_img_write(const vpx_image_t *img, FILE *file) {
int plane;
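+ // High bit depth formats store two bytes per sample.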
+ const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
for (plane = 0; plane < 3; ++plane) {
const unsigned char *buf = img->planes[plane];
const int stride = img->stride[plane];
- const int w = vpx_img_plane_width(img, plane) *
- ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+ int w = vpx_img_plane_width(img, plane);
const int h = vpx_img_plane_height(img, plane);
int y;
+ // Assuming that for nv12 we write all chroma data at once
+ if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
+ // Fixing NV12 chroma width if it is odd
+ if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
+
for (y = 0; y < h; ++y) {
- fwrite(buf, 1, w, file);
+ fwrite(buf, bytespp, w, file);
buf += stride;
}
}
@@ -247,17 +246,22 @@ void vpx_img_write(const vpx_image_t *img, FILE *file) {
int vpx_img_read(vpx_image_t *img, FILE *file) {
int plane;
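+ // High bit depth formats store two bytes per sample.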
+ const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
for (plane = 0; plane < 3; ++plane) {
unsigned char *buf = img->planes[plane];
const int stride = img->stride[plane];
- const int w = vpx_img_plane_width(img, plane) *
- ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+ int w = vpx_img_plane_width(img, plane);
const int h = vpx_img_plane_height(img, plane);
int y;
+ // Assuming that for nv12 we read all chroma data at once
+ if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
+ // Fixing NV12 chroma width if it is odd
+ if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
+
for (y = 0; y < h; ++y) {
- if (fread(buf, 1, w, file) != (size_t)w) return 0;
+ if (fread(buf, bytespp, w, file) != (size_t)w) return 0;
buf += stride;
}
}
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
index ee3c281f0f..a54e81084b 100644
--- a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
@@ -16,7 +16,7 @@
#include "vpx_ports/mem.h"
static const int8_t vp8_sub_pel_filters[8][8] = {
- { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */
+ { 0, 0, -128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */
{ 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */
{ 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
{ 0, -9, 93, 50, -6, 0, 0, 0 },
diff --git a/media/libvpx/libvpx/vp8/common/entropy.c b/media/libvpx/libvpx/vp8/common/entropy.c
index fc4a3539fd..b9efc0cc1f 100644
--- a/media/libvpx/libvpx/vp8/common/entropy.c
+++ b/media/libvpx/libvpx/vp8/common/entropy.c
@@ -114,7 +114,7 @@ static const vp8_prob Pcat6[] = { 254, 254, 243, 230, 196, 177,
p[0] = p[1] = 0;
}
- void init_bit_trees() {
+ void init_bit_trees(void) {
init_bit_tree(cat1, 1);
init_bit_tree(cat2, 2);
init_bit_tree(cat3, 3);
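
init_bit_trees is one of several declarations this patch tightens from () to (void); vp8_initialize, vp8_rtcd, vp9_rtcd, get_cpu_count and vp8_get_processor_freq follow below. Pre-C23, an empty parameter list declares a function with unspecified parameters, so calls with any arguments compile silently, while (void) is a real prototype. A minimal illustration:

/* Pre-C23, f() leaves the parameters unspecified... */
void f();
void accepted_but_wrong(void) { f(1, 2, 3); } /* no diagnostic required */

/* ...while g(void) is a prototype, so misuse is diagnosed. */
void g(void);
/* g(1) would now be a compile-time error: too many arguments. */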
diff --git a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
index 71529bdfd8..7c8e083f4f 100644
--- a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
+++ b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
@@ -25,23 +25,19 @@
#include "vp8/common/systemdependent.h"
#if CONFIG_MULTITHREAD
-#if HAVE_UNISTD_H && !defined(__OS2__)
+#if HAVE_UNISTD_H
#include <unistd.h>
#elif defined(_WIN32)
#include <windows.h>
typedef void(WINAPI *PGNSI)(LPSYSTEM_INFO);
-#elif defined(__OS2__)
-#define INCL_DOS
-#define INCL_DOSSPINLOCK
-#include <os2.h>
#endif
#endif
#if CONFIG_MULTITHREAD
-static int get_cpu_count() {
+static int get_cpu_count(void) {
int core_count = 16;
-#if HAVE_UNISTD_H && !defined(__OS2__)
+#if HAVE_UNISTD_H
#if defined(_SC_NPROCESSORS_ONLN)
core_count = (int)sysconf(_SC_NPROCESSORS_ONLN);
#elif defined(_SC_NPROC_ONLN)
@@ -49,38 +45,13 @@ static int get_cpu_count() {
#endif
#elif defined(_WIN32)
{
-#if _WIN32_WINNT >= 0x0501
+#if _WIN32_WINNT < 0x0501
+#error _WIN32_WINNT must target Windows XP or newer.
+#endif
SYSTEM_INFO sysinfo;
GetNativeSystemInfo(&sysinfo);
-#else
- PGNSI pGNSI;
- SYSTEM_INFO sysinfo;
-
- /* Call GetNativeSystemInfo if supported or
- * GetSystemInfo otherwise. */
-
- pGNSI = (PGNSI)GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
- "GetNativeSystemInfo");
- if (pGNSI != NULL)
- pGNSI(&sysinfo);
- else
- GetSystemInfo(&sysinfo);
-#endif
-
core_count = (int)sysinfo.dwNumberOfProcessors;
}
-#elif defined(__OS2__)
- {
- ULONG proc_id;
- ULONG status;
-
- core_count = 0;
- for (proc_id = 1;; ++proc_id) {
- if (DosGetProcessorStatus(proc_id, &status)) break;
-
- if (status == PROC_ONLINE) core_count++;
- }
- }
#else
/* other platforms */
#endif
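
With the OS/2 branch deleted, get_cpu_count keeps two paths: sysconf on POSIX and GetNativeSystemInfo on Windows XP or newer (older targets now fail at build time via the #error). The surviving POSIX side, condensed into a self-contained sketch:

#include <unistd.h>

/* Condensed POSIX branch; 16 mirrors the patched function's default. */
static int cpu_count(void) {
  long n = -1;
#if defined(_SC_NPROCESSORS_ONLN)
  n = sysconf(_SC_NPROCESSORS_ONLN);
#elif defined(_SC_NPROC_ONLN)
  n = sysconf(_SC_NPROC_ONLN);
#endif
  return n > 0 ? (int)n : 16;
}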
diff --git a/media/libvpx/libvpx/vp8/common/onyx.h b/media/libvpx/libvpx/vp8/common/onyx.h
index 1b70ea5dba..2038c000b0 100644
--- a/media/libvpx/libvpx/vp8/common/onyx.h
+++ b/media/libvpx/libvpx/vp8/common/onyx.h
@@ -242,7 +242,7 @@ typedef struct {
#endif
} VP8_CONFIG;
-void vp8_initialize();
+void vp8_initialize(void);
struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf);
void vp8_remove_compressor(struct VP8_COMP **comp);
diff --git a/media/libvpx/libvpx/vp8/common/rtcd.c b/media/libvpx/libvpx/vp8/common/rtcd.c
index 09a0e2b4b3..102b7ccd54 100644
--- a/media/libvpx/libvpx/vp8/common/rtcd.c
+++ b/media/libvpx/libvpx/vp8/common/rtcd.c
@@ -12,4 +12,4 @@
#include "./vp8_rtcd.h"
#include "vpx_ports/vpx_once.h"
-void vp8_rtcd() { once(setup_rtcd_internal); }
+void vp8_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vp8/common/threading.h b/media/libvpx/libvpx/vp8/common/threading.h
index 1cfb9fec51..0de75cfde3 100644
--- a/media/libvpx/libvpx/vp8/common/threading.h
+++ b/media/libvpx/libvpx/vp8/common/threading.h
@@ -19,161 +19,57 @@ extern "C" {
#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
-/* Thread management macros */
#if defined(_WIN32) && !HAVE_PTHREAD_H
/* Win32 */
-#include <process.h>
#include <windows.h>
-#if defined(__GNUC__) && \
- (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
-#define THREAD_FUNCTION \
- __attribute__((force_align_arg_pointer)) unsigned int __stdcall
-#else
-#define THREAD_FUNCTION unsigned int __stdcall
-#endif
-#define THREAD_FUNCTION_RETURN DWORD
-#define THREAD_SPECIFIC_INDEX DWORD
-#define pthread_t HANDLE
-#define pthread_attr_t DWORD
-#define pthread_detach(thread) \
- if (thread != NULL) CloseHandle(thread)
-#define thread_sleep(nms) Sleep(nms)
-#define pthread_cancel(thread) terminate_thread(thread, 0)
-#define ts_key_create(ts_key, destructor) \
- { ts_key = TlsAlloc(); };
-#define pthread_getspecific(ts_key) TlsGetValue(ts_key)
-#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value)
-#define pthread_self() GetCurrentThreadId()
-
-#elif defined(__OS2__)
-/* OS/2 */
-#define INCL_DOS
-#include <os2.h>
-
-#include <stdlib.h>
-#define THREAD_FUNCTION void *
-#define THREAD_FUNCTION_RETURN void *
-#define THREAD_SPECIFIC_INDEX PULONG
-#define pthread_t TID
-#define pthread_attr_t ULONG
-#define pthread_detach(thread) 0
-#define thread_sleep(nms) DosSleep(nms)
-#define pthread_cancel(thread) DosKillThread(thread)
-#define ts_key_create(ts_key, destructor) \
- DosAllocThreadLocalMemory(1, &(ts_key));
-#define pthread_getspecific(ts_key) ((void *)(*(ts_key)))
-#define pthread_setspecific(ts_key, value) (*(ts_key) = (ULONG)(value))
-#define pthread_self() _gettid()
#else
+/* pthreads */
#ifdef __APPLE__
#include <mach/mach_init.h>
#include <mach/semaphore.h>
#include <mach/task.h>
#include <time.h>
#include <unistd.h>
-
#else
#include <semaphore.h>
#endif
-
-#include <pthread.h>
-/* pthreads */
-/* Nearly everything is already defined */
-#define THREAD_FUNCTION void *
-#define THREAD_FUNCTION_RETURN void *
-#define THREAD_SPECIFIC_INDEX pthread_key_t
-#define ts_key_create(ts_key, destructor) \
- pthread_key_create(&(ts_key), destructor);
#endif
/* Synchronization macros: Win32 and Pthreads */
#if defined(_WIN32) && !HAVE_PTHREAD_H
-#define sem_t HANDLE
-#define pause(voidpara) __asm PAUSE
-#define sem_init(sem, sem_attr1, sem_init_value) \
- (int)((*sem = CreateSemaphore(NULL, 0, 32768, NULL)) == NULL)
-#define sem_wait(sem) \
+#define vp8_sem_t HANDLE
+#define vp8_sem_init(sem, pshared, value) \
+ (int)((*sem = CreateSemaphore(NULL, value, 32768, NULL)) == NULL)
+#define vp8_sem_wait(sem) \
(int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem, INFINITE))
-#define sem_post(sem) ReleaseSemaphore(*sem, 1, NULL)
-#define sem_destroy(sem) \
+#define vp8_sem_post(sem) ReleaseSemaphore(*sem, 1, NULL)
+#define vp8_sem_destroy(sem) \
if (*sem) ((int)(CloseHandle(*sem)) == TRUE)
#define thread_sleep(nms) Sleep(nms)
-#elif defined(__OS2__)
-typedef struct {
- HEV event;
- HMTX wait_mutex;
- HMTX count_mutex;
- int count;
-} sem_t;
-
-static inline int sem_init(sem_t *sem, int pshared, unsigned int value) {
- DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0,
- value > 0 ? TRUE : FALSE);
- DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE);
- DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE);
-
- sem->count = value;
-
- return 0;
-}
-
-static inline int sem_wait(sem_t *sem) {
- DosRequestMutexSem(sem->wait_mutex, -1);
-
- DosWaitEventSem(sem->event, -1);
-
- DosRequestMutexSem(sem->count_mutex, -1);
-
- sem->count--;
- if (sem->count == 0) {
- ULONG post_count;
-
- DosResetEventSem(sem->event, &post_count);
- }
-
- DosReleaseMutexSem(sem->count_mutex);
-
- DosReleaseMutexSem(sem->wait_mutex);
-
- return 0;
-}
-
-static inline int sem_post(sem_t *sem) {
- DosRequestMutexSem(sem->count_mutex, -1);
-
- if (sem->count < 32768) {
- sem->count++;
- DosPostEventSem(sem->event);
- }
-
- DosReleaseMutexSem(sem->count_mutex);
-
- return 0;
-}
-
-static inline int sem_destroy(sem_t *sem) {
- DosCloseEventSem(sem->event);
- DosCloseMutexSem(sem->wait_mutex);
- DosCloseMutexSem(sem->count_mutex);
-
- return 0;
-}
-
-#define thread_sleep(nms) DosSleep(nms)
-
#else
#ifdef __APPLE__
-#define sem_t semaphore_t
-#define sem_init(X, Y, Z) \
- semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z)
-#define sem_wait(sem) (semaphore_wait(*sem))
-#define sem_post(sem) semaphore_signal(*sem)
-#define sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem)
+#define vp8_sem_t semaphore_t
+#define vp8_sem_init(sem, pshared, value) \
+ semaphore_create(mach_task_self(), sem, SYNC_POLICY_FIFO, value)
+#define vp8_sem_wait(sem) semaphore_wait(*sem)
+#define vp8_sem_post(sem) semaphore_signal(*sem)
+#define vp8_sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem)
#else
+#include <errno.h>
#include <unistd.h>
#include <sched.h>
+#define vp8_sem_t sem_t
+#define vp8_sem_init sem_init
+static INLINE int vp8_sem_wait(vp8_sem_t *sem) {
+ int ret;
+ while ((ret = sem_wait(sem)) == -1 && errno == EINTR) {
+ }
+ return ret;
+}
+#define vp8_sem_post sem_post
+#define vp8_sem_destroy sem_destroy
#endif /* __APPLE__ */
/* Not Windows. Assume pthreads */
@@ -194,7 +90,6 @@ static inline int sem_destroy(sem_t *sem) {
#define x86_pause_hint()
#endif
-#include "vpx_util/vpx_thread.h"
#include "vpx_util/vpx_atomics.h"
static INLINE void vp8_atomic_spin_wait(
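
The vp8_sem_* layer gives every platform the same four operations and lets the POSIX build retry sem_wait when a signal interrupts it (sem_wait can fail with errno == EINTR; the Win32 and Mach variants have no equivalent failure). Callers change only the prefix; a sketch of typical use, assuming the macros above:

/* Hypothetical caller, assuming the vp8_sem_* layer from threading.h. */
static void handshake(void) {
  vp8_sem_t ready;
  vp8_sem_init(&ready, 0 /* not shared */, 0 /* initial count */);
  vp8_sem_post(&ready); /* producer side */
  vp8_sem_wait(&ready); /* consumer side; retries internally on EINTR */
  vp8_sem_destroy(&ready);
}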
diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c
index 2248345ba2..88f2de024b 100644
--- a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c
+++ b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c
@@ -428,6 +428,7 @@ int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf) {
#if CONFIG_MULTITHREAD
if (setjmp(fb->pbi[0]->common.error.jmp)) {
+ fb->pbi[0]->common.error.setjmp = 0;
vp8_remove_decoder_instances(fb);
vp8_zero(fb->pbi);
vpx_clear_system_state();
@@ -452,6 +453,7 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb) {
/* decoder instance for single thread mode */
remove_decompressor(pbi);
+ fb->pbi[0] = NULL;
return VPX_CODEC_OK;
}
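
The added error.setjmp assignments follow the libvpx error convention: setjmp arms a recovery point, vpx_internal_error longjmps to it while the flag is set, and the flag must be cleared before the jmp_buf's stack frame dies, otherwise a later error would jump through a stale frame. A sketch of the shape of that pattern (raise_error stands in for vpx_internal_error):

#include <setjmp.h>

struct err_info { int armed; jmp_buf jmp; };

static void raise_error(struct err_info *e) {
  if (e->armed) longjmp(e->jmp, 1); /* unwind to the recovery point */
  /* otherwise record the error and return normally */
}

static int do_work(struct err_info *e) {
  if (setjmp(e->jmp)) { /* error path */
    e->armed = 0;       /* disarm before the jmp_buf goes out of scope */
    return -1;
  }
  e->armed = 1;
  /* ... work that may call raise_error(e) ... */
  e->armed = 0;         /* disarm on the success path as well */
  return 0;
}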
diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h
index 1070849620..08a60b31b9 100644
--- a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h
+++ b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h
@@ -14,6 +14,7 @@
#include <assert.h>
#include "vpx_config.h"
+#include "vpx_util/vpx_pthread.h"
#include "vp8/common/onyxd.h"
#include "treereader.h"
#include "vp8/common/onyxc_int.h"
@@ -94,8 +95,8 @@ typedef struct VP8D_COMP {
DECODETHREAD_DATA *de_thread_data;
pthread_t *h_decoding_thread;
- sem_t *h_event_start_decoding;
- sem_t h_event_end_decoding;
+ vp8_sem_t *h_event_start_decoding;
+ vp8_sem_t h_event_end_decoding;
/* end of threading data */
#endif
diff --git a/media/libvpx/libvpx/vp8/decoder/threading.c b/media/libvpx/libvpx/vp8/decoder/threading.c
index 6ccb080cf9..d16284d134 100644
--- a/media/libvpx/libvpx/vp8/decoder/threading.c
+++ b/media/libvpx/libvpx/vp8/decoder/threading.c
@@ -15,6 +15,7 @@
#endif
#include "onyxd_int.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_pthread.h"
#include "vp8/common/common.h"
#include "vp8/common/threading.h"
#include "vp8/common/loopfilter.h"
@@ -577,10 +578,10 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
/* signal end of decoding of current thread for current frame */
if (last_mb_row + (int)pbi->decoding_thread_count + 1 >= pc->mb_rows)
- sem_post(&pbi->h_event_end_decoding);
+ vp8_sem_post(&pbi->h_event_end_decoding);
}
-static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
+static THREADFN thread_decoding_proc(void *p_data) {
int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
@@ -589,7 +590,7 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
while (1) {
if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break;
- if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) {
+ if (vp8_sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) {
if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) {
break;
} else {
@@ -598,16 +599,17 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
if (setjmp(xd->error_info.jmp)) {
xd->error_info.setjmp = 0;
// Signal the end of decoding for current thread.
- sem_post(&pbi->h_event_end_decoding);
+ vp8_sem_post(&pbi->h_event_end_decoding);
continue;
}
xd->error_info.setjmp = 1;
mt_decode_mb_rows(pbi, xd, ithread + 1);
+ xd->error_info.setjmp = 0;
}
}
}
- return 0;
+ return THREAD_EXIT_SUCCESS;
}
void vp8_decoder_create_threads(VP8D_COMP *pbi) {
@@ -634,13 +636,13 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32);
CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count);
- if (sem_init(&pbi->h_event_end_decoding, 0, 0)) {
+ if (vp8_sem_init(&pbi->h_event_end_decoding, 0, 0)) {
vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to initialize semaphore");
}
for (ithread = 0; ithread < pbi->decoding_thread_count; ++ithread) {
- if (sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break;
+ if (vp8_sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break;
vp8_setup_block_dptrs(&pbi->mb_row_di[ithread].mbd);
@@ -650,7 +652,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
if (pthread_create(&pbi->h_decoding_thread[ithread], 0,
thread_decoding_proc, &pbi->de_thread_data[ithread])) {
- sem_destroy(&pbi->h_event_start_decoding[ithread]);
+ vp8_sem_destroy(&pbi->h_event_start_decoding[ithread]);
break;
}
}
@@ -661,7 +663,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
/* the remainder of cleanup cases will be handled in
* vp8_decoder_remove_threads(). */
if (pbi->allocated_decoding_thread_count == 0) {
- sem_destroy(&pbi->h_event_end_decoding);
+ vp8_sem_destroy(&pbi->h_event_end_decoding);
}
vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to create threads");
@@ -812,16 +814,16 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) {
/* allow all threads to exit */
for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
- sem_post(&pbi->h_event_start_decoding[i]);
+ vp8_sem_post(&pbi->h_event_start_decoding[i]);
pthread_join(pbi->h_decoding_thread[i], NULL);
}
for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
- sem_destroy(&pbi->h_event_start_decoding[i]);
+ vp8_sem_destroy(&pbi->h_event_start_decoding[i]);
}
if (pbi->allocated_decoding_thread_count) {
- sem_destroy(&pbi->h_event_end_decoding);
+ vp8_sem_destroy(&pbi->h_event_end_decoding);
}
vpx_free(pbi->h_decoding_thread);
@@ -883,7 +885,7 @@ int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) {
pbi->decoding_thread_count);
for (i = 0; i < pbi->decoding_thread_count; ++i) {
- sem_post(&pbi->h_event_start_decoding[i]);
+ vp8_sem_post(&pbi->h_event_start_decoding[i]);
}
if (setjmp(xd->error_info.jmp)) {
@@ -893,15 +895,16 @@ int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) {
// the current frame while the main thread starts decoding the next frame,
// which causes a data race.
for (i = 0; i < pbi->decoding_thread_count; ++i)
- sem_wait(&pbi->h_event_end_decoding);
+ vp8_sem_wait(&pbi->h_event_end_decoding);
return -1;
}
xd->error_info.setjmp = 1;
mt_decode_mb_rows(pbi, xd, 0);
+ xd->error_info.setjmp = 0;
for (i = 0; i < pbi->decoding_thread_count + 1; ++i)
- sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
+ vp8_sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
return 0;
}
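
The wait counts in vp8mt_decode_mb_rows are deliberate: each worker posts h_event_end_decoding once, and the main thread, which also decodes rows, posts once more, so a clean frame drains the semaphore with decoding_thread_count + 1 waits while the error path waits only decoding_thread_count times (the main thread never reached its own post). In miniature, with start_decoding, end_decoding and the row pass as stand-ins:

static void run_frame(int N) {
  int i;
  for (i = 0; i < N; ++i) vp8_sem_post(&start_decoding[i]); /* wake workers */
  decode_rows_on_main_thread(); /* posts end_decoding once itself */
  for (i = 0; i < N + 1; ++i) vp8_sem_wait(&end_decoding); /* drain all */
}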
diff --git a/media/libvpx/libvpx/vp8/encoder/encodeframe.c b/media/libvpx/libvpx/vp8/encoder/encodeframe.c
index 82c48b13a7..d0117897db 100644
--- a/media/libvpx/libvpx/vp8/encoder/encodeframe.c
+++ b/media/libvpx/libvpx/vp8/encoder/encodeframe.c
@@ -7,38 +7,38 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <stdio.h>
#include <limits.h>
+#include <stdio.h>
#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "bitstream.h"
-#include "encodemb.h"
-#include "encodemv.h"
-#if CONFIG_MULTITHREAD
-#include "ethreading.h"
-#endif
+
#include "vp8/common/common.h"
-#include "onyx_int.h"
-#include "vp8/common/extend.h"
#include "vp8/common/entropymode.h"
-#include "vp8/common/quant_common.h"
-#include "segmentation.h"
-#include "vp8/common/setupintrarecon.h"
-#include "encodeintra.h"
-#include "vp8/common/reconinter.h"
-#include "rdopt.h"
-#include "pickinter.h"
+#include "vp8/common/extend.h"
#include "vp8/common/findnearmv.h"
#include "vp8/common/invtrans.h"
+#include "vp8/common/quant_common.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/setupintrarecon.h"
+#include "vp8/common/threading.h"
+#include "vp8/encoder/bitstream.h"
+#include "vp8/encoder/encodeframe.h"
+#include "vp8/encoder/encodeintra.h"
+#include "vp8/encoder/encodemb.h"
+#include "vp8/encoder/encodemv.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/pickinter.h"
+#include "vp8/encoder/rdopt.h"
+#include "vp8/encoder/segmentation.h"
+#include "vp8_rtcd.h"
#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/vpx_timer.h"
-#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
-#include "bitstream.h"
+
+#if CONFIG_MULTITHREAD
+#include "vp8/encoder/ethreading.h"
#endif
-#include "encodeframe.h"
extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x);
@@ -773,7 +773,7 @@ void vp8_encode_frame(VP8_COMP *cpi) {
vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1);
for (i = 0; i < cpi->encoding_thread_count; ++i) {
- sem_post(&cpi->h_event_start_encoding[i]);
+ vp8_sem_post(&cpi->h_event_start_encoding[i]);
}
for (mb_row = 0; mb_row < cm->mb_rows;
@@ -806,7 +806,7 @@ void vp8_encode_frame(VP8_COMP *cpi) {
}
/* Wait for all the threads to finish. */
for (i = 0; i < cpi->encoding_thread_count; ++i) {
- sem_wait(&cpi->h_event_end_encoding[i]);
+ vp8_sem_wait(&cpi->h_event_end_encoding[i]);
}
for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
diff --git a/media/libvpx/libvpx/vp8/encoder/ethreading.c b/media/libvpx/libvpx/vp8/encoder/ethreading.c
index e2f8b89d46..98c87d3cbc 100644
--- a/media/libvpx/libvpx/vp8/encoder/ethreading.c
+++ b/media/libvpx/libvpx/vp8/encoder/ethreading.c
@@ -10,6 +10,7 @@
#include <stddef.h>
#include "onyx_int.h"
+#include "vpx_util/vpx_pthread.h"
#include "vp8/common/threading.h"
#include "vp8/common/common.h"
#include "vp8/common/extend.h"
@@ -22,27 +23,27 @@
extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x,
int ok_to_skip);
-static THREAD_FUNCTION thread_loopfilter(void *p_data) {
+static THREADFN thread_loopfilter(void *p_data) {
VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1);
VP8_COMMON *cm = &cpi->common;
while (1) {
if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
- if (sem_wait(&cpi->h_event_start_lpf) == 0) {
+ if (vp8_sem_wait(&cpi->h_event_start_lpf) == 0) {
/* we're shutting down */
if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
vp8_loopfilter_frame(cpi, cm);
- sem_post(&cpi->h_event_end_lpf);
+ vp8_sem_post(&cpi->h_event_end_lpf);
}
}
- return 0;
+ return THREAD_EXIT_SUCCESS;
}
-static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
+static THREADFN thread_encoding_proc(void *p_data) {
int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;
VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
@@ -51,7 +52,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
while (1) {
if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
- if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) {
+ if (vp8_sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) {
const int nsync = cpi->mt_sync_range;
VP8_COMMON *cm = &cpi->common;
int mb_row;
@@ -307,12 +308,12 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
x->gf_active_ptr += cm->mb_cols * cpi->encoding_thread_count;
}
/* Signal that this thread has completed processing its rows. */
- sem_post(&cpi->h_event_end_encoding[ithread]);
+ vp8_sem_post(&cpi->h_event_end_encoding[ithread]);
}
}
/* printf("exit thread %d\n", ithread); */
- return 0;
+ return THREAD_EXIT_SUCCESS;
}
static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) {
@@ -514,9 +515,9 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread,
vpx_malloc(sizeof(pthread_t) * th_count));
CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_start_encoding,
- vpx_malloc(sizeof(sem_t) * th_count));
+ vpx_malloc(sizeof(vp8_sem_t) * th_count));
CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_end_encoding,
- vpx_malloc(sizeof(sem_t) * th_count));
+ vpx_malloc(sizeof(vp8_sem_t) * th_count));
CHECK_MEM_ERROR(&cpi->common.error, cpi->mb_row_ei,
vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
@@ -538,8 +539,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
vp8_setup_block_ptrs(&cpi->mb_row_ei[ithread].mb);
vp8_setup_block_dptrs(&cpi->mb_row_ei[ithread].mb.e_mbd);
- sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
- sem_init(&cpi->h_event_end_encoding[ithread], 0, 0);
+ vp8_sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
+ vp8_sem_init(&cpi->h_event_end_encoding[ithread], 0, 0);
ethd->ithread = ithread;
ethd->ptr1 = (void *)cpi;
@@ -554,11 +555,11 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
/* shutdown other threads */
vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
for (--ithread; ithread >= 0; ithread--) {
- sem_post(&cpi->h_event_start_encoding[ithread]);
- sem_post(&cpi->h_event_end_encoding[ithread]);
+ vp8_sem_post(&cpi->h_event_start_encoding[ithread]);
+ vp8_sem_post(&cpi->h_event_end_encoding[ithread]);
pthread_join(cpi->h_encoding_thread[ithread], 0);
- sem_destroy(&cpi->h_event_start_encoding[ithread]);
- sem_destroy(&cpi->h_event_end_encoding[ithread]);
+ vp8_sem_destroy(&cpi->h_event_start_encoding[ithread]);
+ vp8_sem_destroy(&cpi->h_event_end_encoding[ithread]);
}
/* free thread related resources */
@@ -580,8 +581,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
{
LPFTHREAD_DATA *lpfthd = &cpi->lpf_thread_data;
- sem_init(&cpi->h_event_start_lpf, 0, 0);
- sem_init(&cpi->h_event_end_lpf, 0, 0);
+ vp8_sem_init(&cpi->h_event_start_lpf, 0, 0);
+ vp8_sem_init(&cpi->h_event_end_lpf, 0, 0);
lpfthd->ptr1 = (void *)cpi;
rc = pthread_create(&cpi->h_filter_thread, 0, thread_loopfilter, lpfthd);
@@ -590,14 +591,14 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
/* shutdown other threads */
vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
for (--ithread; ithread >= 0; ithread--) {
- sem_post(&cpi->h_event_start_encoding[ithread]);
- sem_post(&cpi->h_event_end_encoding[ithread]);
+ vp8_sem_post(&cpi->h_event_start_encoding[ithread]);
+ vp8_sem_post(&cpi->h_event_end_encoding[ithread]);
pthread_join(cpi->h_encoding_thread[ithread], 0);
- sem_destroy(&cpi->h_event_start_encoding[ithread]);
- sem_destroy(&cpi->h_event_end_encoding[ithread]);
+ vp8_sem_destroy(&cpi->h_event_start_encoding[ithread]);
+ vp8_sem_destroy(&cpi->h_event_end_encoding[ithread]);
}
- sem_destroy(&cpi->h_event_end_lpf);
- sem_destroy(&cpi->h_event_start_lpf);
+ vp8_sem_destroy(&cpi->h_event_end_lpf);
+ vp8_sem_destroy(&cpi->h_event_start_lpf);
/* free thread related resources */
vpx_free(cpi->h_event_start_encoding);
@@ -627,21 +628,21 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) {
int i;
for (i = 0; i < cpi->encoding_thread_count; ++i) {
- sem_post(&cpi->h_event_start_encoding[i]);
- sem_post(&cpi->h_event_end_encoding[i]);
+ vp8_sem_post(&cpi->h_event_start_encoding[i]);
+ vp8_sem_post(&cpi->h_event_end_encoding[i]);
pthread_join(cpi->h_encoding_thread[i], 0);
- sem_destroy(&cpi->h_event_start_encoding[i]);
- sem_destroy(&cpi->h_event_end_encoding[i]);
+ vp8_sem_destroy(&cpi->h_event_start_encoding[i]);
+ vp8_sem_destroy(&cpi->h_event_end_encoding[i]);
}
- sem_post(&cpi->h_event_start_lpf);
+ vp8_sem_post(&cpi->h_event_start_lpf);
pthread_join(cpi->h_filter_thread, 0);
}
- sem_destroy(&cpi->h_event_end_lpf);
- sem_destroy(&cpi->h_event_start_lpf);
+ vp8_sem_destroy(&cpi->h_event_end_lpf);
+ vp8_sem_destroy(&cpi->h_event_start_lpf);
cpi->b_lpf_running = 0;
/* free thread related resources */
diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_if.c b/media/libvpx/libvpx/vp8/encoder/onyx_if.c
index 4e128e3c49..ad01c6fc86 100644
--- a/media/libvpx/libvpx/vp8/encoder/onyx_if.c
+++ b/media/libvpx/libvpx/vp8/encoder/onyx_if.c
@@ -63,7 +63,7 @@
extern int vp8_update_coef_context(VP8_COMP *cpi);
#endif
-extern unsigned int vp8_get_processor_freq();
+extern unsigned int vp8_get_processor_freq(void);
int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
@@ -267,7 +267,11 @@ static int rescale(int val, int num, int denom) {
int64_t llden = denom;
int64_t llval = val;
- return (int)(llval * llnum / llden);
+ int64_t result = (llval * llnum / llden);
+ if (result <= INT_MAX)
+ return (int)result;
+ else
+ return INT_MAX;
}
void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
@@ -276,7 +280,10 @@ void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
LAYER_CONTEXT *lc = &cpi->layer_context[layer];
lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer];
- lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000;
+ if (cpi->oxcf.target_bitrate[layer] > INT_MAX / 1000)
+ lc->target_bandwidth = INT_MAX;
+ else
+ lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000;
lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level;
lc->optimal_buffer_level_in_ms = oxcf->optimal_buffer_level;
@@ -1381,7 +1388,10 @@ void vp8_update_layer_contexts(VP8_COMP *cpi) {
LAYER_CONTEXT *lc = &cpi->layer_context[i];
lc->framerate = cpi->ref_framerate / oxcf->rate_decimator[i];
- lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;
+ if (oxcf->target_bitrate[i] > INT_MAX / 1000)
+ lc->target_bandwidth = INT_MAX;
+ else
+ lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;
lc->starting_buffer_level = rescale(
(int)oxcf->starting_buffer_level_in_ms, lc->target_bandwidth, 1000);
@@ -1995,6 +2005,7 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) {
#if CONFIG_MULTITHREAD
if (vp8cx_create_encoder_threads(cpi)) {
+ cpi->common.error.setjmp = 0;
vp8_remove_compressor(&cpi);
return 0;
}
@@ -2048,8 +2059,6 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) {
vp8_loop_filter_init(cm);
- cpi->common.error.setjmp = 0;
-
#if CONFIG_MULTI_RES_ENCODING
/* Calculate # of MBs in a row in lower-resolution level image. */
@@ -2076,6 +2085,8 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) {
vp8_setup_block_ptrs(&cpi->mb);
vp8_setup_block_dptrs(&cpi->mb.e_mbd);
+ cpi->common.error.setjmp = 0;
+
return cpi;
}
@@ -3172,7 +3183,8 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) {
#if CONFIG_MULTITHREAD
if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
- sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
+ /* signal that we have set filter_level */
+ vp8_sem_post(&cpi->h_event_end_lpf);
}
#endif
@@ -4387,11 +4399,11 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
#if CONFIG_MULTITHREAD
if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
/* start loopfilter in separate thread */
- sem_post(&cpi->h_event_start_lpf);
+ vp8_sem_post(&cpi->h_event_start_lpf);
cpi->b_lpf_running = 1;
/* wait for the filter_level to be picked so that we can continue with
* stream packing */
- sem_wait(&cpi->h_event_end_lpf);
+ vp8_sem_wait(&cpi->h_event_end_lpf);
} else
#endif
{
@@ -5120,6 +5132,14 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
vpx_usec_timer_mark(&cmptimer);
cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
+#if CONFIG_MULTITHREAD
+ /* wait for the lpf thread to finish */

+ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) {
+ vp8_sem_wait(&cpi->h_event_end_lpf);
+ cpi->b_lpf_running = 0;
+ }
+#endif
+
if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) {
generate_psnr_packet(cpi);
}
@@ -5247,16 +5267,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
#endif
#endif
- cpi->common.error.setjmp = 0;
-
-#if CONFIG_MULTITHREAD
- /* wait for the lpf thread done */
- if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) {
- sem_wait(&cpi->h_event_end_lpf);
- cpi->b_lpf_running = 0;
- }
-#endif
-
return 0;
}
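
The arithmetic changes in onyx_if.c are one defensive pattern: widen to 64 bits (or test the bound first) and saturate at INT_MAX instead of letting a multiply overflow. target_bitrate is configured in kbit/s and converted to bit/s, hence the INT_MAX / 1000 guard, while rescale already computed in 64 bits but truncated blindly. Both, condensed:

#include <limits.h>

/* Saturating kbit/s -> bit/s, as in vp8_init_temporal_layer_context. */
static int kbps_to_bps(int kbps) {
  return kbps > INT_MAX / 1000 ? INT_MAX : kbps * 1000;
}

/* Saturating (val * num) / denom in 64-bit, as in the patched rescale. */
static int rescale_sat(int val, int num, int denom) {
  const long long r = (long long)val * num / denom;
  return r <= INT_MAX ? (int)r : INT_MAX;
}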
diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_int.h b/media/libvpx/libvpx/vp8/encoder/onyx_int.h
index 1451a27812..bb1518ed7f 100644
--- a/media/libvpx/libvpx/vp8/encoder/onyx_int.h
+++ b/media/libvpx/libvpx/vp8/encoder/onyx_int.h
@@ -20,6 +20,7 @@
#include "tokenize.h"
#include "vp8/common/onyxc_int.h"
#include "vpx_dsp/variance.h"
+#include "vpx_util/vpx_pthread.h"
#include "encodemb.h"
#include "vp8/encoder/quantize.h"
#include "vp8/common/entropy.h"
@@ -540,10 +541,10 @@ typedef struct VP8_COMP {
LPFTHREAD_DATA lpf_thread_data;
/* events */
- sem_t *h_event_start_encoding;
- sem_t *h_event_end_encoding;
- sem_t h_event_start_lpf;
- sem_t h_event_end_lpf;
+ vp8_sem_t *h_event_start_encoding;
+ vp8_sem_t *h_event_end_encoding;
+ vp8_sem_t h_event_start_lpf;
+ vp8_sem_t h_event_end_lpf;
#endif
TOKENLIST *tplist;
diff --git a/media/libvpx/libvpx/vp8/encoder/ratectrl.c b/media/libvpx/libvpx/vp8/encoder/ratectrl.c
index fcd4eb04eb..7ba7a308ab 100644
--- a/media/libvpx/libvpx/vp8/encoder/ratectrl.c
+++ b/media/libvpx/libvpx/vp8/encoder/ratectrl.c
@@ -791,8 +791,12 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
(int)((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) /
one_percent_bits);
} else if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level) {
- percent_high =
- (int)((100 * cpi->bits_off_target) / (cpi->total_byte_count * 8));
+ if (cpi->total_byte_count > 0) {
+ percent_high = (int)((100 * cpi->bits_off_target) /
+ (cpi->total_byte_count * 8));
+ } else {
+ percent_high = cpi->oxcf.over_shoot_pct;
+ }
}
if (percent_high > cpi->oxcf.over_shoot_pct) {
@@ -1190,10 +1194,13 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) {
/* Calculate required scaling factor based on target frame size and
* size of frame produced using previous Q
*/
- if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS)) {
- /* Case where we would overflow int */
- target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs)
- << BPER_MB_NORMBITS;
+ if (target_bits_per_frame > (INT_MAX >> BPER_MB_NORMBITS)) {
+ int temp = target_bits_per_frame / cpi->common.MBs;
+ if (temp > (INT_MAX >> BPER_MB_NORMBITS)) {
+ target_bits_per_mb = INT_MAX;
+ } else {
+ target_bits_per_mb = temp << BPER_MB_NORMBITS;
+ }
} else {
target_bits_per_mb =
(target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
@@ -1534,9 +1541,13 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
// undershoots significantly, and then we end up dropping every other
// frame because the QP/rate_correction_factor may have been too low
// before the drop and then takes too long to come up.
- if (target_size >= (INT_MAX >> BPER_MB_NORMBITS)) {
- target_bits_per_mb = (target_size / cpi->common.MBs)
- << BPER_MB_NORMBITS;
+ if (target_size > (INT_MAX >> BPER_MB_NORMBITS)) {
+ int temp = target_size / cpi->common.MBs;
+ if (temp > (INT_MAX >> BPER_MB_NORMBITS)) {
+ target_bits_per_mb = INT_MAX;
+ } else {
+ target_bits_per_mb = temp << BPER_MB_NORMBITS;
+ }
} else {
target_bits_per_mb =
(target_size << BPER_MB_NORMBITS) / cpi->common.MBs;
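
Both ratectrl.c hunks harden the same fixed-point conversion: the per-frame bit target becomes a per-macroblock value scaled by 1 << BPER_MB_NORMBITS, the code divides before shifting when the frame target is large, and the patch adds a second guard because even the quotient can overflow the shift. The logic, condensed (assuming vp8's BPER_MB_NORMBITS value of 9):

#include <limits.h>

#define BPER_MB_NORMBITS 9 /* vp8's fixed-point scale for bits per MB */

static int bits_per_mb(int target_bits, int mbs) {
  if (target_bits > (INT_MAX >> BPER_MB_NORMBITS)) {
    const int per_mb = target_bits / mbs; /* divide first */
    if (per_mb > (INT_MAX >> BPER_MB_NORMBITS)) return INT_MAX;
    return per_mb << BPER_MB_NORMBITS;
  }
  return (target_bits << BPER_MB_NORMBITS) / mbs; /* common case */
}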
diff --git a/media/libvpx/libvpx/vp8/encoder/tokenize.h b/media/libvpx/libvpx/vp8/encoder/tokenize.h
index 47b5be17f1..5223aa2d86 100644
--- a/media/libvpx/libvpx/vp8/encoder/tokenize.h
+++ b/media/libvpx/libvpx/vp8/encoder/tokenize.h
@@ -18,8 +18,6 @@
extern "C" {
#endif
-void vp8_tokenize_initialize();
-
typedef struct {
short Token;
short Extra;
diff --git a/media/libvpx/libvpx/vp8/vp8_cx_iface.c b/media/libvpx/libvpx/vp8/vp8_cx_iface.c
index 1f16cc53d3..2b238c1a97 100644
--- a/media/libvpx/libvpx/vp8/vp8_cx_iface.c
+++ b/media/libvpx/libvpx/vp8/vp8_cx_iface.c
@@ -8,6 +8,11 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <limits.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
@@ -18,6 +23,7 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/static_assert.h"
#include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_thread.h"
#include "vpx_util/vpx_timestamp.h"
#if CONFIG_MULTITHREAD
#include "vp8/encoder/ethreading.h"
@@ -27,8 +33,6 @@
#include "vp8/encoder/firstpass.h"
#include "vp8/common/onyx.h"
#include "vp8/common/common.h"
-#include <stdlib.h>
-#include <string.h>
struct vp8_extracfg {
struct vpx_codec_pkt_list *pkt_list;
@@ -148,7 +152,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(cfg, g_profile, 3);
RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
- RANGE_CHECK_HI(cfg, g_threads, 64);
+ RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS);
#if CONFIG_REALTIME_ONLY
RANGE_CHECK_HI(cfg, g_lag_in_frames, 0);
#elif CONFIG_MULTI_RES_ENCODING
@@ -495,7 +499,10 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx,
set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
vp8_change_config(ctx->cpi, &ctx->oxcf);
#if CONFIG_MULTITHREAD
- if (vp8cx_create_encoder_threads(ctx->cpi)) return VPX_CODEC_ERROR;
+ if (vp8cx_create_encoder_threads(ctx->cpi)) {
+ ctx->cpi->common.error.setjmp = 0;
+ return VPX_CODEC_ERROR;
+ }
#endif
ctx->cpi->common.error.setjmp = 0;
return VPX_CODEC_OK;
@@ -777,9 +784,9 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
return res;
}
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
- unsigned long duration,
- vpx_enc_deadline_t deadline) {
+static vpx_codec_err_t pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
+ unsigned long duration,
+ vpx_enc_deadline_t deadline) {
int new_qc;
#if !(CONFIG_REALTIME_ONLY)
@@ -788,13 +795,15 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
if (deadline) {
/* Convert duration parameter from stream timebase to microseconds */
- uint64_t duration_us;
-
VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 &&
(TICKS_PER_SEC % 1000000) == 0);
- duration_us = duration * (uint64_t)ctx->timestamp_ratio.num /
- (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000));
+ if (duration > UINT64_MAX / (uint64_t)ctx->timestamp_ratio.num) {
+ ERROR("duration is too big");
+ }
+ uint64_t duration_us =
+ duration * (uint64_t)ctx->timestamp_ratio.num /
+ ((uint64_t)ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000));
/* If the deadline is more than the duration this frame is to be shown,
* use good quality mode. Otherwise use realtime mode.
@@ -820,6 +829,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
ctx->oxcf.Mode = new_qc;
vp8_change_config(ctx->cpi, &ctx->oxcf);
}
+ return VPX_CODEC_OK;
}
static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx,
@@ -894,13 +904,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
if (!res) res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1);
- if (!ctx->pts_offset_initialized) {
- ctx->pts_offset = pts_val;
- ctx->pts_offset_initialized = 1;
- }
- pts_val -= ctx->pts_offset;
-
- pick_quickcompress_mode(ctx, duration, deadline);
+ if (!res) res = pick_quickcompress_mode(ctx, duration, deadline);
vpx_codec_pkt_list_init(&ctx->pkt_list);
// If no flags are set in the encode call, then use the frame flags as
@@ -924,7 +928,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
/* Initialize the encoder instance on the first frame*/
if (!res && ctx->cpi) {
unsigned int lib_flags;
- YV12_BUFFER_CONFIG sd;
int64_t dst_time_stamp, dst_end_time_stamp;
size_t size, cx_data_sz;
unsigned char *cx_data;
@@ -951,12 +954,44 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
/* Convert API flags to internal codec lib flags */
lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
- dst_time_stamp =
- pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
- dst_end_time_stamp = (pts_val + (int64_t)duration) *
- ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
-
if (img != NULL) {
+ YV12_BUFFER_CONFIG sd;
+
+ if (!ctx->pts_offset_initialized) {
+ ctx->pts_offset = pts_val;
+ ctx->pts_offset_initialized = 1;
+ }
+ if (pts_val < ctx->pts_offset) {
+ vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "pts is smaller than initial pts");
+ }
+ pts_val -= ctx->pts_offset;
+ if (pts_val > INT64_MAX / ctx->timestamp_ratio.num) {
+ vpx_internal_error(
+ &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "conversion of relative pts to ticks would overflow");
+ }
+ dst_time_stamp =
+ pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
+#if ULONG_MAX > INT64_MAX
+ if (duration > INT64_MAX) {
+ vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "duration is too big");
+ }
+#endif
+ if (pts_val > INT64_MAX - (int64_t)duration) {
+ vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "relative pts + duration is too big");
+ }
+ vpx_codec_pts_t pts_end = pts_val + (int64_t)duration;
+ if (pts_end > INT64_MAX / ctx->timestamp_ratio.num) {
+ vpx_internal_error(
+ &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "conversion of relative pts + duration to ticks would overflow");
+ }
+ dst_end_time_stamp =
+ pts_end * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
+
res = image2yuvconfig(img, &sd);
if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
@@ -989,6 +1024,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
&dst_end_time_stamp, !img);
if (comp_data_state == VPX_CODEC_CORRUPT_FRAME) {
+ ctx->cpi->common.error.setjmp = 0;
return VPX_CODEC_CORRUPT_FRAME;
} else if (comp_data_state == -1) {
break;
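
The vp8e_encode hunk moves the pts bookkeeping under img != NULL and validates every step that can overflow a 64-bit tick count: the offset subtraction, the scaling by timestamp_ratio.num, and pts + duration. All the checks take the standard pre-multiplication form, testing a > LIMIT / b before computing a * b. Isolated, assuming positive num and den as timestamp_ratio guarantees:

#include <stdint.h>

/* Overflow-checked a * num / den, the shape of the timestamp scaling.
 * Returns nonzero on overflow instead of raising a codec error. */
static int checked_scale(int64_t a, int64_t num, int64_t den, int64_t *out) {
  if (a > INT64_MAX / num) return 1; /* a * num would overflow */
  *out = a * num / den;
  return 0;
}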
diff --git a/media/libvpx/libvpx/vp8/vp8_dx_iface.c b/media/libvpx/libvpx/vp8/vp8_dx_iface.c
index e81deaf4ea..fa7d7be403 100644
--- a/media/libvpx/libvpx/vp8/vp8_dx_iface.c
+++ b/media/libvpx/libvpx/vp8/vp8_dx_iface.c
@@ -488,7 +488,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
if (pc->fb_idx_ref_cnt[pc->new_fb_idx] > 0) {
pc->fb_idx_ref_cnt[pc->new_fb_idx]--;
}
- pc->error.setjmp = 0;
+ pbi->common.error.setjmp = 0;
#if CONFIG_MULTITHREAD
if (pbi->restart_threads) {
ctx->si.w = 0;
diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc
index 261c316fd1..312092f190 100644
--- a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc
+++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc
@@ -8,10 +8,13 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "vp8/vp8_ratectrl_rtc.h"
+
#include <math.h>
+
#include <new>
+
#include "vp8/common/common.h"
-#include "vp8/vp8_ratectrl_rtc.h"
#include "vp8/encoder/onyx_int.h"
#include "vp8/encoder/ratectrl.h"
#include "vpx_ports/system_state.h"
@@ -311,6 +314,14 @@ FrameDropDecision VP8RateControlRTC::ComputeQP(
int VP8RateControlRTC::GetQP() const { return q_; }
+UVDeltaQP VP8RateControlRTC::GetUVDeltaQP() const {
+ VP8_COMMON *cm = &cpi_->common;
+ UVDeltaQP uv_delta_q;
+ uv_delta_q.uvdc_delta_q = cm->uvdc_delta_q;
+ uv_delta_q.uvac_delta_q = cm->uvac_delta_q;
+ return uv_delta_q;
+}
+
int VP8RateControlRTC::GetLoopfilterLevel() const {
VP8_COMMON *cm = &cpi_->common;
const double qp = q_;
diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h
index 59fb607526..b458b5ce65 100644
--- a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h
+++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h
@@ -21,7 +21,6 @@ struct VP8_COMP;
namespace libvpx {
struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig {
- public:
VP8RateControlRtcConfig() {
memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate));
memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator));
@@ -42,6 +41,9 @@ class VP8RateControlRTC {
bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg);
// GetQP() needs to be called after ComputeQP() to get the latest QP
int GetQP() const;
+ // GetUVDeltaQP() needs to be called after ComputeQP() to get the latest
+ // delta QP for UV.
+ UVDeltaQP GetUVDeltaQP() const;
// GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter
// level is calculated from frame qp.
int GetLoopfilterLevel() const;
@@ -53,10 +55,10 @@ class VP8RateControlRTC {
void PostEncodeUpdate(uint64_t encoded_frame_size);
private:
- VP8RateControlRTC() {}
+ VP8RateControlRTC() = default;
bool InitRateControl(const VP8RateControlRtcConfig &cfg);
- struct VP8_COMP *cpi_;
- int q_;
+ struct VP8_COMP *cpi_ = nullptr;
+ int q_ = -1;
};
} // namespace libvpx
diff --git a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h
index 1cfc12f6fa..4c8fcf6989 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h
@@ -13,7 +13,6 @@
#include "./vpx_config.h"
#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_util/vpx_thread.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_loopfilter.h"
diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c
index 37762ca15a..1a93b97e56 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c
@@ -12,4 +12,4 @@
#include "./vp9_rtcd.h"
#include "vpx_ports/vpx_once.h"
-void vp9_rtcd() { once(setup_rtcd_internal); }
+void vp9_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl
index 3ecbd5417f..af3ff0e980 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -129,7 +129,7 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
-specialize qw/vp9_block_error_fp neon avx2 sse2/;
+specialize qw/vp9_block_error_fp neon sve avx2 sse2/;
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/;
@@ -138,12 +138,12 @@ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t
specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- specialize qw/vp9_block_error neon avx2 sse2/;
+ specialize qw/vp9_block_error neon sve avx2 sse2/;
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
specialize qw/vp9_highbd_block_error neon sse2/;
} else {
- specialize qw/vp9_block_error neon avx2 msa sse2/;
+ specialize qw/vp9_block_error neon sve avx2 msa sse2/;
}
# fdct functions
diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c
index 8df18af3b8..24adbcbff0 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c
@@ -13,6 +13,7 @@
#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_pthread.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_thread_common.h"
#include "vp9/common/vp9_reconinter.h"
diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h
index 5df0117f12..96c705d0d5 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h
@@ -12,6 +12,7 @@
#define VPX_VP9_COMMON_VP9_THREAD_COMMON_H_
#include "./vpx_config.h"
#include "vp9/common/vp9_loopfilter.h"
+#include "vpx_util/vpx_pthread.h"
#include "vpx_util/vpx_thread.h"
#ifdef __cplusplus
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c
index c5892156f4..4fe680cefc 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -22,6 +22,7 @@
#include "vpx_ports/mem.h"
#include "vpx_ports/mem_ops.h"
#include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_pthread.h"
#include "vpx_util/vpx_thread.h"
#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
#include "vpx_util/vpx_debug_util.h"
@@ -2292,6 +2293,7 @@ static INLINE void init_mt(VP9Decoder *pbi) {
++pbi->num_tile_workers;
winterface->init(worker);
+ worker->thread_name = "vpx tile worker";
if (n < num_threads - 1 && !winterface->reset(worker)) {
do {
winterface->end(&pbi->tile_workers[pbi->num_tile_workers - 1]);
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c
index 5a7e9f9ab3..5c77df5002 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c
@@ -21,6 +21,7 @@
#include "vpx_ports/vpx_once.h"
#include "vpx_ports/vpx_timer.h"
#include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_pthread.h"
#include "vpx_util/vpx_thread.h"
#include "vp9/common/vp9_alloccommon.h"
@@ -210,6 +211,7 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
cm->error.setjmp = 0;
vpx_get_worker_interface()->init(&pbi->lf_worker);
+ pbi->lf_worker.thread_name = "vpx lf worker";
return pbi;
}
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h
index 2e198d552e..b3ee4eab5f 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h
@@ -16,6 +16,7 @@
#include "vpx/vpx_codec.h"
#include "vpx_dsp/bitreader.h"
#include "vpx_scale/yv12config.h"
+#include "vpx_util/vpx_pthread.h"
#include "vpx_util/vpx_thread.h"
#include "vp9/common/vp9_thread_common.h"
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c
index 9a31f5a6d0..926ae87739 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c
@@ -12,6 +12,7 @@
#include <string.h>
#include "vpx/vpx_integer.h"
+#include "vpx_util/vpx_pthread.h"
#include "vp9/decoder/vp9_job_queue.h"
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h
index bc23bf9c2c..59f71fb9ba 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h
@@ -11,7 +11,7 @@
#ifndef VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
#define VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
-#include "vpx_util/vpx_thread.h"
+#include "vpx_util/vpx_pthread.h"
typedef struct {
// Pointer to buffer base which contains the jobs
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c
new file mode 100644
index 0000000000..78e7361d85
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+
+int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ int64x2_t err_v = vdupq_n_s64(0);
+ int64x2_t ssz_v = vdupq_n_s64(0);
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+
+ const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+ const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+ const int16x8_t diff0 = vabdq_s16(c0, d0);
+ const int16x8_t diff1 = vabdq_s16(c1, d1);
+
+ err_v = vpx_dotq_s16(err_v, diff0, diff0);
+ err_v = vpx_dotq_s16(err_v, diff1, diff1);
+
+ ssz_v = vpx_dotq_s16(ssz_v, c0, c0);
+ ssz_v = vpx_dotq_s16(ssz_v, c1, c1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ *ssz = horizontal_add_int64x2(ssz_v);
+ return horizontal_add_int64x2(err_v);
+}
+
+int64_t vp9_block_error_fp_sve(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff, int block_size) {
+ int64x2_t err = vdupq_n_s64(0);
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+
+ const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+ const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+ const int16x8_t diff0 = vabdq_s16(c0, d0);
+ const int16x8_t diff1 = vabdq_s16(c1, d1);
+
+ err = vpx_dotq_s16(err, diff0, diff0);
+ err = vpx_dotq_s16(err, diff1, diff1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ return horizontal_add_int64x2(err);
+}
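
The new kernel leans on the Neon/SVE bridge: vpx_dotq_s16 maps to an SDOT-style widening dot product that accumulates int16 products straight into int64 lanes, so no intermediate 32-bit widening is needed, and vabdq_s16 is valid because |c - d| squared equals (c - d) squared. A scalar reference of what both functions compute, useful for checking the kernel (this sketch fixes the coefficient type to int32_t; in libvpx tran_low_t is int16_t or int32_t depending on the high-bitdepth configuration):

#include <stdint.h>

/* Scalar reference for vp9_block_error_sve: returns the sum of squared
 * coeff/dqcoeff differences, stores the sum of squared coeffs in *ssz. */
static int64_t block_error_ref(const int32_t *coeff, const int32_t *dqcoeff,
                               intptr_t block_size, int64_t *ssz) {
  int64_t err = 0, sz = 0;
  for (intptr_t i = 0; i < block_size; ++i) {
    const int64_t d = (int64_t)coeff[i] - dqcoeff[i];
    err += d * d;
    sz += (int64_t)coeff[i] * coeff[i];
  }
  *ssz = sz;
  return err;
}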
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_block.h b/media/libvpx/libvpx/vp9/encoder/vp9_block.h
index 7fa00cd194..6542794667 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_block.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_block.h
@@ -11,8 +11,6 @@
#ifndef VPX_VP9_ENCODER_VP9_BLOCK_H_
#define VPX_VP9_ENCODER_VP9_BLOCK_H_
-#include "vpx_util/vpx_thread.h"
-
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_entropy.h"
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
index 42073f756c..ee0fcd8729 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
@@ -119,8 +119,8 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
PC_TREE *const tree = &td->pc_tree[pc_tree_index];
tree->block_size = square[0];
alloc_tree_contexts(cm, tree, 4);
- tree->leaf_split[0] = this_leaf++;
- for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0];
+ tree->u.leaf_split[0] = this_leaf++;
+ for (j = 1; j < 4; j++) tree->u.leaf_split[j] = tree->u.leaf_split[0];
}
// Each node has 4 leaf nodes, fill each block_size level of the tree
@@ -130,7 +130,7 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
PC_TREE *const tree = &td->pc_tree[pc_tree_index];
alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
tree->block_size = square[square_index];
- for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
+ for (j = 0; j < 4; j++) tree->u.split[j] = this_pc++;
++pc_tree_index;
}
++square_index;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
index 4e301cc17d..51e13ba654 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
@@ -90,7 +90,7 @@ typedef struct PC_TREE {
union {
struct PC_TREE *split[4];
PICK_MODE_CONTEXT *leaf_split[4];
- };
+ } u;
// Obtained from a simple motion search. Used by the ML based partition search
// speed feature.
MV mv;
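
Naming the union is a portability and diagnostics fix: anonymous struct and union members are only standard since C11 (an extension before that), so some compilers warn about or reject the old form; the cost is the pc_tree->u.split[i] spelling applied mechanically through the rest of this patch. The two forms side by side:

struct pick_ctx; /* stand-in for PICK_MODE_CONTEXT */

struct node_anon {  /* anonymous union: C11, or a compiler extension */
  union {
    struct node_anon *split[4];
    struct pick_ctx *leaf_split[4];
  };                /* members accessed as n->split[i] */
};

struct node_named { /* named union: portable to C90/C99 */
  union {
    struct node_named *split[4];
    struct pick_ctx *leaf_split[4];
  } u;              /* members accessed as n->u.split[i] */
};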
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
index 46291f4868..b24c85f406 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -21,7 +21,7 @@
#include "vpx_ports/mem.h"
#include "vpx_ports/vpx_timer.h"
#include "vpx_ports/system_state.h"
-
+#include "vpx_util/vpx_pthread.h"
#if CONFIG_MISMATCH_DEBUG
#include "vpx_util/vpx_debug_util.h"
#endif // CONFIG_MISMATCH_DEBUG
@@ -2303,16 +2303,16 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile,
assert(partition == PARTITION_SPLIT);
if (bsize == BLOCK_8X8) {
encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
- pc_tree->leaf_split[0]);
+ pc_tree->u.leaf_split[0]);
} else {
encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
- pc_tree->split[0]);
+ pc_tree->u.split[0]);
encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
- subsize, pc_tree->split[1]);
+ subsize, pc_tree->u.split[1]);
encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
- subsize, pc_tree->split[2]);
+ subsize, pc_tree->u.split[2]);
encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
- subsize, pc_tree->split[3]);
+ subsize, pc_tree->u.split[3]);
}
break;
}
@@ -2645,13 +2645,13 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td,
assert(partition == PARTITION_SPLIT);
subsize = get_subsize(bsize, PARTITION_SPLIT);
encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
- pc_tree->split[0]);
+ pc_tree->u.split[0]);
encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
- subsize, pc_tree->split[1]);
+ subsize, pc_tree->u.split[1]);
encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
- subsize, pc_tree->split[2]);
+ subsize, pc_tree->u.split[2]);
encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs,
- output_enabled, subsize, pc_tree->split[3]);
+ output_enabled, subsize, pc_tree->u.split[3]);
break;
}
@@ -2801,7 +2801,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
assert(partition == PARTITION_SPLIT);
if (bsize == BLOCK_8X8) {
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
- subsize, pc_tree->leaf_split[0], INT_MAX, INT64_MAX);
+ subsize, pc_tree->u.leaf_split[0], INT_MAX, INT64_MAX);
break;
}
last_part_rdc.rate = 0;
@@ -2819,7 +2819,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
rd_use_partition(cpi, td, tile_data, mi_8x8 + jj * bss * mis + ii * bss,
tp, mi_row + y_idx, mi_col + x_idx, subsize,
&tmp_rdc.rate, &tmp_rdc.dist, i != 3,
- pc_tree->split[i]);
+ pc_tree->u.split[i]);
if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
vp9_rd_cost_reset(&last_part_rdc);
break;
@@ -2860,9 +2860,9 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
continue;
save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
- pc_tree->split[i]->partitioning = PARTITION_NONE;
+ pc_tree->u.split[i]->partitioning = PARTITION_NONE;
rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
- &tmp_rdc, split_subsize, &pc_tree->split[i]->none,
+ &tmp_rdc, split_subsize, &pc_tree->u.split[i]->none,
INT_MAX, INT64_MAX);
restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -2877,7 +2877,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
if (i != 3)
encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0,
- split_subsize, pc_tree->split[i]);
+ split_subsize, pc_tree->u.split[i]);
pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
split_subsize);
@@ -3391,7 +3391,7 @@ static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x,
features[feature_index++] = VPXMIN(rd_ratio, 2.0f);
for (i = 0; i < 4; ++i) {
- const int64_t this_rd = pc_tree->split[i]->none.rdcost;
+ const int64_t this_rd = pc_tree->u.split[i]->none.rdcost;
const int rd_valid = this_rd > 0 && this_rd < 1000000000;
// Ratio between sub-block RD and whole block RD.
features[feature_index++] =
@@ -3958,19 +3958,19 @@ static void store_superblock_info(
}
// recursively traverse partition tree when partition is split.
assert(pc_tree->partitioning == PARTITION_SPLIT);
- store_superblock_info(pc_tree->split[0], mi_grid_visible, mi_stride,
+ store_superblock_info(pc_tree->u.split[0], mi_grid_visible, mi_stride,
subblock_square_size_4x4, num_unit_rows, num_unit_cols,
row_start_4x4, col_start_4x4, partition_info,
motion_vector_info);
- store_superblock_info(pc_tree->split[1], mi_grid_visible, mi_stride,
+ store_superblock_info(pc_tree->u.split[1], mi_grid_visible, mi_stride,
subblock_square_size_4x4, num_unit_rows, num_unit_cols,
row_start_4x4, col_start_4x4 + subblock_square_size_4x4,
partition_info, motion_vector_info);
- store_superblock_info(pc_tree->split[2], mi_grid_visible, mi_stride,
+ store_superblock_info(pc_tree->u.split[2], mi_grid_visible, mi_stride,
subblock_square_size_4x4, num_unit_rows, num_unit_cols,
row_start_4x4 + subblock_square_size_4x4, col_start_4x4,
partition_info, motion_vector_info);
- store_superblock_info(pc_tree->split[3], mi_grid_visible, mi_stride,
+ store_superblock_info(pc_tree->u.split[3], mi_grid_visible, mi_stride,
subblock_square_size_4x4, num_unit_rows, num_unit_cols,
row_start_4x4 + subblock_square_size_4x4,
col_start_4x4 + subblock_square_size_4x4,
@@ -4114,7 +4114,7 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
vp9_zero(pc_tree->mv);
}
if (bsize > BLOCK_8X8) { // Store MV result as reference for subblocks.
- for (i = 0; i < 4; ++i) pc_tree->split[i]->mv = pc_tree->mv;
+ for (i = 0; i < 4; ++i) pc_tree->u.split[i]->mv = pc_tree->mv;
}
}
@@ -4199,25 +4199,25 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
// PARTITION_SPLIT
// TODO(jingning): use the motion vectors given by the above search as
// the starting point of motion search in the following partition type check.
- pc_tree->split[0]->none.rdcost = 0;
- pc_tree->split[1]->none.rdcost = 0;
- pc_tree->split[2]->none.rdcost = 0;
- pc_tree->split[3]->none.rdcost = 0;
+ pc_tree->u.split[0]->none.rdcost = 0;
+ pc_tree->u.split[1]->none.rdcost = 0;
+ pc_tree->u.split[2]->none.rdcost = 0;
+ pc_tree->u.split[3]->none.rdcost = 0;
if (do_split || must_split) {
subsize = get_subsize(bsize, PARTITION_SPLIT);
load_pred_mv(x, ctx);
if (bsize == BLOCK_8X8) {
i = 4;
if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
- pc_tree->leaf_split[0]->pred_interp_filter = pred_interp_filter;
+ pc_tree->u.leaf_split[0]->pred_interp_filter = pred_interp_filter;
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
- pc_tree->leaf_split[0], best_rdc.rate, best_rdc.dist);
+ pc_tree->u.leaf_split[0], best_rdc.rate, best_rdc.dist);
if (sum_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
} else {
if (cpi->sf.prune_ref_frame_for_rect_partitions) {
- const int ref1 = pc_tree->leaf_split[0]->mic.ref_frame[0];
- const int ref2 = pc_tree->leaf_split[0]->mic.ref_frame[1];
+ const int ref1 = pc_tree->u.leaf_split[0]->mic.ref_frame[0];
+ const int ref2 = pc_tree->u.leaf_split[0]->mic.ref_frame[1];
for (i = 0; i < 4; ++i) {
ref_frames_used[i] |= (1 << ref1);
if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
@@ -4250,21 +4250,21 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
continue;
- pc_tree->split[i]->index = i;
+ pc_tree->u.split[i]->index = i;
if (cpi->sf.prune_ref_frame_for_rect_partitions)
- pc_tree->split[i]->none.rate = INT_MAX;
+ pc_tree->u.split[i]->none.rate = INT_MAX;
found_best_rd = rd_pick_partition(
cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
- &this_rdc, best_rdc_split, pc_tree->split[i]);
+ &this_rdc, best_rdc_split, pc_tree->u.split[i]);
if (found_best_rd == 0) {
sum_rdc.rdcost = INT64_MAX;
break;
} else {
if (cpi->sf.prune_ref_frame_for_rect_partitions &&
- pc_tree->split[i]->none.rate != INT_MAX) {
- const int ref1 = pc_tree->split[i]->none.mic.ref_frame[0];
- const int ref2 = pc_tree->split[i]->none.mic.ref_frame[1];
+ pc_tree->u.split[i]->none.rate != INT_MAX) {
+ const int ref1 = pc_tree->u.split[i]->none.mic.ref_frame[0];
+ const int ref2 = pc_tree->u.split[i]->none.mic.ref_frame[1];
ref_frames_used[i] |= (1 << ref1);
if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
}
@@ -4821,13 +4821,13 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, int mi_row,
}
break;
case PARTITION_SPLIT: {
- fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->split[0]);
+ fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->u.split[0]);
fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize,
- pc_tree->split[1]);
+ pc_tree->u.split[1]);
fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize,
- pc_tree->split[2]);
+ pc_tree->u.split[2]);
fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize,
- pc_tree->split[3]);
+ pc_tree->u.split[3]);
break;
}
default: break;
@@ -4845,7 +4845,8 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
if (bsize > BLOCK_8X8) {
BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
int i;
- for (i = 0; i < 4; ++i) pred_pixel_ready_reset(pc_tree->split[i], subsize);
+ for (i = 0; i < 4; ++i)
+ pred_pixel_ready_reset(pc_tree->u.split[i], subsize);
}
}
@@ -5046,9 +5047,9 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
continue;
load_pred_mv(x, ctx);
- nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
- mi_col + x_idx, subsize, &this_rdc, 0,
- best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
+ nonrd_pick_partition(
+ cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+ &this_rdc, 0, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->u.split[i]);
if (this_rdc.rate == INT_MAX) {
vp9_rd_cost_reset(&sum_rdc);
@@ -5281,10 +5282,10 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
subsize = get_subsize(bsize, PARTITION_SPLIT);
nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
subsize, output_enabled, rd_cost,
- pc_tree->split[0]);
+ pc_tree->u.split[0]);
nonrd_select_partition(cpi, td, tile_data, mi + hbs, tp, mi_row,
mi_col + hbs, subsize, output_enabled, &this_rdc,
- pc_tree->split[1]);
+ pc_tree->u.split[1]);
if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
rd_cost->rate += this_rdc.rate;
@@ -5292,7 +5293,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
}
nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis, tp,
mi_row + hbs, mi_col, subsize, output_enabled,
- &this_rdc, pc_tree->split[2]);
+ &this_rdc, pc_tree->u.split[2]);
if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
rd_cost->rate += this_rdc.rate;
@@ -5300,7 +5301,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
}
nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
mi_row + hbs, mi_col + hbs, subsize,
- output_enabled, &this_rdc, pc_tree->split[3]);
+ output_enabled, &this_rdc, pc_tree->u.split[3]);
if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
rd_cost->rate += this_rdc.rate;
@@ -5400,21 +5401,21 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td,
subsize = get_subsize(bsize, PARTITION_SPLIT);
if (bsize == BLOCK_8X8) {
nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
- subsize, pc_tree->leaf_split[0]);
+ subsize, pc_tree->u.leaf_split[0]);
encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
- subsize, pc_tree->leaf_split[0]);
+ subsize, pc_tree->u.leaf_split[0]);
} else {
nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize,
- output_enabled, dummy_cost, pc_tree->split[0]);
+ output_enabled, dummy_cost, pc_tree->u.split[0]);
nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp, mi_row,
mi_col + hbs, subsize, output_enabled, dummy_cost,
- pc_tree->split[1]);
+ pc_tree->u.split[1]);
nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp,
mi_row + hbs, mi_col, subsize, output_enabled,
- dummy_cost, pc_tree->split[2]);
+ dummy_cost, pc_tree->u.split[2]);
nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
mi_row + hbs, mi_col + hbs, subsize, output_enabled,
- dummy_cost, pc_tree->split[3]);
+ dummy_cost, pc_tree->u.split[3]);
}
break;
}
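Every hunk above rewrites accesses from pc_tree->split[i] / pc_tree->leaf_split[0] to pc_tree->u.split[i] / pc_tree->u.leaf_split[0], which suggests the partition tree's two mutually exclusive child views now share storage in a union. The PC_TREE definition itself is outside this diff, so the layout below is an assumption -- a minimal sketch of the pattern, not the real struct:

    /* Sketch only: the two child representations of a partition-tree node
     * are never live at the same time -- leaf_split applies to BLOCK_8X8
     * nodes, split to larger blocks -- so a union lets them share storage. */
    struct leaf_ctx; /* stand-in for the per-block mode context type */
    struct node_sketch {
      int partitioning; /* PARTITION_NONE, PARTITION_SPLIT, ... */
      union {
        struct node_sketch *split[4];   /* children when bsize > BLOCK_8X8 */
        struct leaf_ctx *leaf_split[4]; /* leaf contexts at BLOCK_8X8 */
      } u;
    };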
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
index fd213f1e6b..3b8b5345f1 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
@@ -31,12 +31,14 @@
#include "vpx_ports/system_state.h"
#include "vpx_ports/vpx_once.h"
#include "vpx_ports/vpx_timer.h"
+#include "vpx_util/vpx_pthread.h"
#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
#include "vpx_util/vpx_debug_util.h"
#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_enums.h"
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_idct.h"
#if CONFIG_VP9_POSTPROC
@@ -2135,24 +2137,22 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
cpi->external_resize = 1;
}
- if (cpi->initial_width) {
- int new_mi_size = 0;
- vp9_set_mb_mi(cm, cm->width, cm->height);
- new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
- if (cm->mi_alloc_size < new_mi_size) {
- vp9_free_context_buffers(cm);
- vp9_free_pc_tree(&cpi->td);
- vpx_free(cpi->mbmi_ext_base);
- alloc_compressor_data(cpi);
- realloc_segmentation_maps(cpi);
- cpi->initial_width = cpi->initial_height = 0;
- cpi->external_resize = 0;
- } else if (cm->mi_alloc_size == new_mi_size &&
- (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
- if (vp9_alloc_loop_filter(cm)) {
- vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate loop filter data");
- }
+ int new_mi_size = 0;
+ vp9_set_mb_mi(cm, cm->width, cm->height);
+ new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+ if (cm->mi_alloc_size < new_mi_size) {
+ vp9_free_context_buffers(cm);
+ vp9_free_pc_tree(&cpi->td);
+ vpx_free(cpi->mbmi_ext_base);
+ alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->initial_width = cpi->initial_height = 0;
+ cpi->external_resize = 0;
+ } else if (cm->mi_alloc_size == new_mi_size &&
+ (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
+ if (vp9_alloc_loop_filter(cm)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate loop filter data");
}
}
@@ -3472,7 +3472,6 @@ void vp9_scale_references(VP9_COMP *cpi) {
continue;
}
-#if CONFIG_VP9_HIGHBITDEPTH
if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
RefCntBuffer *new_fb_ptr = NULL;
int force_scaling = 0;
@@ -3485,6 +3484,7 @@ void vp9_scale_references(VP9_COMP *cpi) {
new_fb_ptr = &pool->frame_bufs[new_fb];
if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
new_fb_ptr->buf.y_crop_height != cm->height) {
+#if CONFIG_VP9_HIGHBITDEPTH
if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
cm->use_highbitdepth,
@@ -3494,22 +3494,7 @@ void vp9_scale_references(VP9_COMP *cpi) {
"Failed to allocate frame buffer");
scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth,
EIGHTTAP, 0);
- cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
- alloc_frame_mvs(cm, new_fb);
- }
#else
- if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
- RefCntBuffer *new_fb_ptr = NULL;
- int force_scaling = 0;
- int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
- if (new_fb == INVALID_IDX) {
- new_fb = get_free_fb(cm);
- force_scaling = 1;
- }
- if (new_fb == INVALID_IDX) return;
- new_fb_ptr = &pool->frame_bufs[new_fb];
- if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
- new_fb_ptr->buf.y_crop_height != cm->height) {
if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
VP9_ENC_BORDER_IN_PIXELS,
@@ -3517,10 +3502,10 @@ void vp9_scale_references(VP9_COMP *cpi) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, EIGHTTAP, 0);
+#endif // CONFIG_VP9_HIGHBITDEPTH
cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
alloc_frame_mvs(cm, new_fb);
}
-#endif // CONFIG_VP9_HIGHBITDEPTH
} else {
int buf_idx;
RefCntBuffer *buf = NULL;
@@ -3958,6 +3943,35 @@ static INLINE void set_raw_source_frame(VP9_COMP *cpi) {
#endif
}
+static YV12_BUFFER_CONFIG *svc_twostage_scale(
+ VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+ YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
+ int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) {
+ if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
+ cm->mi_rows * MI_SIZE != unscaled->y_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->bit_depth == VPX_BITS_8) {
+ vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
+ phase_scaler2);
+ vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type,
+ phase_scaler);
+ } else {
+ scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth,
+ filter_type2, phase_scaler2);
+ scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth,
+ filter_type, phase_scaler);
+ }
+#else
+ vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
+ phase_scaler2);
+ vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return scaled;
+ } else {
+ return unscaled;
+ }
+}
+
static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
uint8_t *dest) {
VP9_COMMON *const cm = &cpi->common;
@@ -4000,7 +4014,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
// result will be saved in scaled_temp and might be used later.
const INTERP_FILTER filter_scaler2 = svc->downsample_filter_type[1];
const int phase_scaler2 = svc->downsample_filter_phase[1];
- cpi->Source = vp9_svc_twostage_scale(
+ cpi->Source = svc_twostage_scale(
cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp,
filter_scaler, phase_scaler, filter_scaler2, phase_scaler2);
svc->scaled_one_half = 1;
@@ -4486,21 +4500,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
// external rate control model.
// This flag doesn't have any impact when external rate control is not used.
int ext_rc_recode = 0;
- // Maximal frame size allowed by the external rate control.
- // case: 0, we ignore the max frame size limit, and encode with the qindex
- // passed in by the external rate control model.
- // If the external qindex is VPX_DEFAULT_Q, libvpx will pick a qindex
- // and may recode if undershoot/overshoot is seen.
- // If the external qindex is not VPX_DEFAULT_Q, we force no recode.
- // case: -1, we take libvpx's decision for the max frame size, as well as
- // the recode decision.
- // Otherwise: if a specific size is given, libvpx's recode decision
- // will respect the given size.
- int ext_rc_max_frame_size = 0;
- // Use VP9's decision of qindex. This flag is in use only in external rate
- // control model to help determine whether to recode when
- // |ext_rc_max_frame_size| is 0.
- int ext_rc_use_default_q = 1;
const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth;
#if CONFIG_RATE_CTRL
@@ -4616,27 +4615,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
}
#endif // CONFIG_RATE_CTRL
if (cpi->ext_ratectrl.ready && !ext_rc_recode &&
+ !cpi->tpl_with_external_rc &&
(cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
vpx_codec_err_t codec_status;
const GF_GROUP *gf_group = &cpi->twopass.gf_group;
vpx_rc_encodeframe_decision_t encode_frame_decision;
- FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
- const int ref_frame_flags = get_ref_frame_flags(cpi);
- RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
- const RefCntBuffer *curr_frame_buf =
- get_ref_cnt_buffer(cm, cm->new_fb_idx);
- // index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
- // index 1 refers to the first encoding frame in a gf group.
- // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref.
- // See function define_gf_group_structure().
- const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
- get_ref_frame_bufs(cpi, ref_frame_bufs);
codec_status = vp9_extrc_get_encodeframe_decision(
- &cpi->ext_ratectrl, curr_frame_buf->frame_index,
- cm->current_frame_coding_index, gf_group->index, update_type,
- gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags,
- &encode_frame_decision);
+ &cpi->ext_ratectrl, gf_group->index, &encode_frame_decision);
if (codec_status != VPX_CODEC_OK) {
vpx_internal_error(&cm->error, codec_status,
"vp9_extrc_get_encodeframe_decision() failed");
@@ -4645,9 +4631,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
// libvpx's default q.
if (encode_frame_decision.q_index != VPX_DEFAULT_Q) {
q = encode_frame_decision.q_index;
- ext_rc_use_default_q = 0;
}
- ext_rc_max_frame_size = encode_frame_decision.max_frame_size;
}
vp9_set_quantizer(cpi, q);
@@ -4690,21 +4674,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
if (cpi->ext_ratectrl.ready &&
(cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) {
- // In general, for the external rate control, we take the qindex provided
- // as input and encode the frame with this qindex faithfully. However,
- // in some extreme scenarios, the provided qindex leads to a massive
- // overshoot of frame size. In this case, we fall back to VP9's decision
- // to pick a new qindex and recode the frame. We return the new qindex
- // through the API to the external model.
- if (ext_rc_max_frame_size == 0) {
- if (!ext_rc_use_default_q) break;
- } else if (ext_rc_max_frame_size == -1) {
- // Do nothing, fall back to libvpx's recode decision.
- } else {
- // Change the max frame size, used in libvpx's recode decision.
- rc->max_frame_bandwidth = ext_rc_max_frame_size;
- }
- ext_rc_recode = 1;
+ break;
}
#if CONFIG_RATE_CTRL
if (cpi->oxcf.use_simple_encode_api) {
@@ -4974,35 +4944,6 @@ static void set_ext_overrides(VP9_COMP *cpi) {
}
}
-YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
- VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
- YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
- int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) {
- if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
- cm->mi_rows * MI_SIZE != unscaled->y_height) {
-#if CONFIG_VP9_HIGHBITDEPTH
- if (cm->bit_depth == VPX_BITS_8) {
- vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
- phase_scaler2);
- vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type,
- phase_scaler);
- } else {
- scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth,
- filter_type2, phase_scaler2);
- scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth,
- filter_type, phase_scaler);
- }
-#else
- vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
- phase_scaler2);
- vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler);
-#endif // CONFIG_VP9_HIGHBITDEPTH
- return scaled;
- } else {
- return unscaled;
- }
-}
-
YV12_BUFFER_CONFIG *vp9_scale_if_required(
VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler) {
@@ -6429,7 +6370,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
if (arf_src_index) {
- assert(arf_src_index <= rc->frames_to_key);
+ if (!(cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+ cpi->ext_ratectrl.funcs.get_gop_decision != NULL)) {
+ // This assert only makes sense when not using external RC.
+ assert(arf_src_index <= rc->frames_to_key);
+ }
if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
cpi->alt_ref_source = source;
@@ -6617,7 +6563,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE &&
cpi->sf.enable_tpl_model) {
vp9_init_tpl_buffer(cpi);
- vp9_estimate_qp_gop(cpi);
+ vp9_estimate_tpl_qp_gop(cpi);
vp9_setup_tpl_stats(cpi);
}
#if CONFIG_COLLECT_COMPONENT_TIMING
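Two behavioral changes to the external rate control hookup run through this file: the per-frame decision callback is now keyed only by the GOP index, and the recode loop exits unconditionally when an external QP controller is active, dropping the old max-frame-size fallback and the ext_rc_use_default_q bookkeeping. A condensed sketch of the resulting q selection (not verbatim source; error handling trimmed):

    /* Condensed from the hunks above (a sketch, not verbatim source). */
    static void pick_q_with_external_rc(VP9_COMP *cpi, int *q) {
      if (cpi->ext_ratectrl.ready && !cpi->tpl_with_external_rc &&
          (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
          cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
        vpx_rc_encodeframe_decision_t decision;
        vp9_extrc_get_encodeframe_decision(
            &cpi->ext_ratectrl, cpi->twopass.gf_group.index, &decision);
        /* VPX_DEFAULT_Q means "let libvpx choose"; anything else is final. */
        if (decision.q_index != VPX_DEFAULT_Q) *q = decision.q_index;
      }
    }

After encoding, the loop now simply breaks whenever VPX_RC_QP is active, so the externally supplied qindex is encoded faithfully and never recoded for undershoot or overshoot.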
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
index 91df538821..898855d10d 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
@@ -25,6 +25,7 @@
#include "vpx_dsp/variance.h"
#include "vpx_dsp/psnr.h"
#include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_pthread.h"
#include "vpx_util/vpx_thread.h"
#include "vpx_util/vpx_timestamp.h"
@@ -1062,7 +1063,7 @@ typedef struct VP9_COMP {
*/
uint64_t frame_component_time[kTimingComponents];
#endif
- // Flag to indicate if QP and GOP for TPL is controlled by external RC.
+ // Flag to indicate if QP and GOP for TPL are controlled by external RC.
int tpl_with_external_rc;
} VP9_COMP;
@@ -1395,11 +1396,6 @@ void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst);
#endif // CONFIG_VP9_HIGHBITDEPTH
-YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
- VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
- YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
- int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2);
-
YV12_BUFFER_CONFIG *vp9_scale_if_required(
VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler);
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
index a8d1cb7a7a..c3b79507e6 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
@@ -17,6 +17,7 @@
#include "vp9/encoder/vp9_multi_thread.h"
#include "vp9/encoder/vp9_temporal_filter.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_util/vpx_pthread.h"
static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
int i, j, k, l, m, n;
@@ -55,7 +56,7 @@ static int enc_worker_hook(void *arg1, void *unused) {
vp9_encode_tile(cpi, thread_data->td, tile_row, tile_col);
}
- return 0;
+ return 1;
}
static int get_max_tile_cols(VP9_COMP *cpi) {
@@ -106,6 +107,7 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) {
++cpi->num_workers;
winterface->init(worker);
+ worker->thread_name = "vpx enc worker";
if (i < num_workers - 1) {
thread_data->cpi = cpi;
@@ -204,8 +206,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
create_enc_workers(cpi, num_workers);
for (i = 0; i < num_workers; i++) {
- EncWorkerData *thread_data;
- thread_data = &cpi->tile_thr_data[i];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
// Before encoding a frame, copy the thread data from cpi.
if (thread_data->td != &cpi->td) {
@@ -456,7 +457,7 @@ static int first_pass_worker_hook(void *arg1, void *arg2) {
this_tile, &best_ref_mv, mb_row);
}
}
- return 0;
+ return 1;
}
void vp9_encode_fp_row_mt(VP9_COMP *cpi) {
@@ -543,7 +544,7 @@ static int temporal_filter_worker_hook(void *arg1, void *arg2) {
mb_col_start, mb_col_end);
}
}
- return 0;
+ return 1;
}
void vp9_temporal_filter_row_mt(VP9_COMP *cpi) {
@@ -616,7 +617,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *arg2) {
vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row);
}
}
- return 0;
+ return 1;
}
void vp9_encode_tiles_row_mt(VP9_COMP *cpi) {
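The four worker hooks in this file switch their return value from 0 to 1. In libvpx's VPxWorker interface a hook's return value is a success flag -- vpx_thread accumulates roughly had_error |= !hook(data1, data2) -- so a hook returning 0 reports failure on every launch. A minimal conforming hook, assuming that convention:

    /* Return nonzero on success, zero on failure -- the framework treats
     * the hook result as a success flag, not an error code. */
    static int enc_worker_hook_sketch(void *arg1, void *arg2) {
      (void)arg1;
      (void)arg2;
      /* ... per-thread encoding work ... */
      return 1;
    }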
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
index 4c192da515..359cdd1290 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
@@ -11,13 +11,14 @@
#ifndef VPX_VP9_ENCODER_VP9_ETHREAD_H_
#define VPX_VP9_ENCODER_VP9_ETHREAD_H_
+#include "vpx_util/vpx_pthread.h"
+
#ifdef __cplusplus
extern "C" {
#endif
#define MAX_NUM_TILE_COLS (1 << 6)
#define MAX_NUM_TILE_ROWS 4
-#define MAX_NUM_THREADS 80
struct VP9_COMP;
struct ThreadData;
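MAX_NUM_THREADS (previously 80 here) leaves this header, while the vp9_cx_iface.c hunk below starts including vpx_util/vpx_thread.h and validates g_threads against MAX_NUM_THREADS instead of a hard-coded 64. Presumably the constant now lives in the threading header; its diff is not part of this section, so that is an inference. If so, a guard like the following would document the compatibility assumption:

    #include "vpx_ports/static_assert.h"
    #include "vpx_util/vpx_thread.h" /* assumed new home of MAX_NUM_THREADS */

    /* The old interface capped g_threads at 64; the new bound must not be
     * lower, or configurations that used to validate would start failing. */
    VPX_STATIC_ASSERT(MAX_NUM_THREADS >= 64);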
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
index 4664e8c5e2..7b0d89acd2 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
@@ -156,32 +156,15 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) {
}
vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
- EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
- FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
- RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+ EXT_RATECTRL *ext_ratectrl, int gop_index,
vpx_rc_encodeframe_decision_t *encode_frame_decision) {
- if (ext_ratectrl == NULL) {
- return VPX_CODEC_INVALID_PARAM;
- }
- if (ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0) {
- vpx_rc_status_t rc_status;
- vpx_rc_encodeframe_info_t encode_frame_info;
- encode_frame_info.show_index = show_index;
- encode_frame_info.coding_index = coding_index;
- encode_frame_info.gop_index = gop_index;
- encode_frame_info.frame_type = extrc_get_frame_type(update_type);
- encode_frame_info.gop_size = gop_size;
- encode_frame_info.use_alt_ref = use_alt_ref;
-
- vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
- encode_frame_info.ref_frame_coding_indexes,
- encode_frame_info.ref_frame_valid_list);
+ assert(ext_ratectrl != NULL);
+ assert(ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0);
- rc_status = ext_ratectrl->funcs.get_encodeframe_decision(
- ext_ratectrl->model, &encode_frame_info, encode_frame_decision);
- if (rc_status == VPX_RC_ERROR) {
- return VPX_CODEC_ERROR;
- }
+ vpx_rc_status_t rc_status = ext_ratectrl->funcs.get_encodeframe_decision(
+ ext_ratectrl->model, gop_index, encode_frame_decision);
+ if (rc_status == VPX_RC_ERROR) {
+ return VPX_CODEC_ERROR;
}
return VPX_CODEC_OK;
}
@@ -222,29 +205,14 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result(
}
vpx_codec_err_t vp9_extrc_get_gop_decision(
- EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info,
- vpx_rc_gop_decision_t *gop_decision) {
+ EXT_RATECTRL *ext_ratectrl, vpx_rc_gop_decision_t *gop_decision) {
vpx_rc_status_t rc_status;
if (ext_ratectrl == NULL || !ext_ratectrl->ready ||
(ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) {
return VPX_CODEC_INVALID_PARAM;
}
- rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model,
- gop_info, gop_decision);
- if (gop_decision->use_alt_ref) {
- const int arf_constraint =
- gop_decision->gop_coding_frames >= gop_info->min_gf_interval &&
- gop_decision->gop_coding_frames < gop_info->lag_in_frames;
- if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR;
- }
- // TODO(chengchen): Take min and max gf interval from the model
- // and overwrite libvpx's decision so that we can get rid
- // of one of the checks here.
- if (gop_decision->gop_coding_frames > gop_info->frames_to_key ||
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref >
- gop_info->max_gf_interval) {
- return VPX_CODEC_ERROR;
- }
+ rc_status =
+ ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, gop_decision);
if (rc_status == VPX_RC_ERROR) {
return VPX_CODEC_ERROR;
}
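Both wrappers shed their validation duties here: the per-frame query asserts its preconditions rather than re-checking them (its single caller already guards them), and the GOP query no longer cross-checks the model's answer against min/max GF interval, frames_to_key, or the ARF constraints -- those checks become the external model's responsibility. Caller-side shape after the change (a fragment sketch; field names taken from the hunks in this diff):

    vpx_rc_gop_decision_t gop_decision;
    if (vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_decision) ==
        VPX_CODEC_OK) {
      /* gop_coding_frames, use_alt_ref and use_key_frame arrive
       * unvalidated; a misbehaving model is no longer caught here. */
    }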
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
index b04580c1d4..d1be5f2aef 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
@@ -39,9 +39,7 @@ vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl,
const VpxTplGopStats *tpl_gop_stats);
vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
- EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
- FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
- RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+ EXT_RATECTRL *ext_ratectrl, int gop_index,
vpx_rc_encodeframe_decision_t *encode_frame_decision);
vpx_codec_err_t vp9_extrc_update_encodeframe_result(
@@ -50,9 +48,8 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result(
const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth,
uint32_t input_bit_depth, const int actual_encoding_qindex);
-vpx_codec_err_t vp9_extrc_get_gop_decision(
- EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info,
- vpx_rc_gop_decision_t *gop_decision);
+vpx_codec_err_t vp9_extrc_get_gop_decision(EXT_RATECTRL *ext_ratectrl,
+ vpx_rc_gop_decision_t *gop_decision);
vpx_codec_err_t vp9_extrc_get_frame_rdmult(
EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
index dcb62e8768..69261ac65f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
@@ -162,42 +162,3 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
dst->uv_stride, src->uv_crop_width, src->uv_crop_height,
et_uv, el_uv, eb_uv, er_uv, chroma_step);
}
-
-void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst, int srcy,
- int srcx, int srch, int srcw) {
- // If the side is not touching the bounder then don't extend.
- const int et_y = srcy ? 0 : dst->border;
- const int el_y = srcx ? 0 : dst->border;
- const int eb_y = srcy + srch != src->y_height
- ? 0
- : dst->border + dst->y_height - src->y_height;
- const int er_y = srcx + srcw != src->y_width
- ? 0
- : dst->border + dst->y_width - src->y_width;
- const int src_y_offset = srcy * src->y_stride + srcx;
- const int dst_y_offset = srcy * dst->y_stride + srcx;
-
- const int et_uv = ROUND_POWER_OF_TWO(et_y, 1);
- const int el_uv = ROUND_POWER_OF_TWO(el_y, 1);
- const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1);
- const int er_uv = ROUND_POWER_OF_TWO(er_y, 1);
- const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
- const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
- const int srch_uv = ROUND_POWER_OF_TWO(srch, 1);
- const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1);
- // detect nv12 colorspace
- const int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1;
-
- copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
- dst->y_buffer + dst_y_offset, dst->y_stride, srcw, srch,
- et_y, el_y, eb_y, er_y, 1);
-
- copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride,
- dst->u_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
- srch_uv, et_uv, el_uv, eb_uv, er_uv, chroma_step);
-
- copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride,
- dst->v_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
- srch_uv, et_uv, el_uv, eb_uv, er_uv, chroma_step);
-}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
index 4ba7fc95e3..21d7e68b9f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
@@ -21,9 +21,6 @@ extern "C" {
void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst);
-void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst, int srcy,
- int srcx, int srch, int srcw);
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
index a9cdf5353f..58b9b7ba61 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
@@ -37,6 +37,7 @@
#include "vp9/encoder/vp9_mcomp.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rd.h"
+#include "vpx/vpx_ext_ratectrl.h"
#include "vpx_dsp/variance.h"
#define OUTPUT_FPF 0
@@ -1164,7 +1165,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
v_fn_ptr.vf = get_block_variance_fn(bsize);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, 8);
+ v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
this_motion_error =
@@ -2769,38 +2770,6 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
}
}
#endif
- // If the external rate control model for GOP is used, the gop decisions
- // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref|
- // will be overwritten.
- if (cpi->ext_ratectrl.ready &&
- (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
- cpi->ext_ratectrl.funcs.get_gop_decision != NULL && !end_of_sequence) {
- vpx_codec_err_t codec_status;
- vpx_rc_gop_decision_t gop_decision;
- vpx_rc_gop_info_t gop_info;
- gop_info.min_gf_interval = rc->min_gf_interval;
- gop_info.max_gf_interval = rc->max_gf_interval;
- gop_info.active_min_gf_interval = active_gf_interval.min;
- gop_info.active_max_gf_interval = active_gf_interval.max;
- gop_info.allow_alt_ref = allow_alt_ref;
- gop_info.is_key_frame = is_key_frame;
- gop_info.last_gop_use_alt_ref = rc->source_alt_ref_active;
- gop_info.frames_since_key = rc->frames_since_key;
- gop_info.frames_to_key = rc->frames_to_key;
- gop_info.lag_in_frames = cpi->oxcf.lag_in_frames;
- gop_info.show_index = cm->current_video_frame;
- gop_info.coding_index = cm->current_frame_coding_index;
- gop_info.gop_global_index = rc->gop_global_index;
-
- codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info,
- &gop_decision);
- if (codec_status != VPX_CODEC_OK) {
- vpx_internal_error(&cm->error, codec_status,
- "vp9_extrc_get_gop_decision() failed");
- }
- gop_coding_frames = gop_decision.gop_coding_frames;
- use_alt_ref = gop_decision.use_alt_ref;
- }
// Was the group length constrained by the requirement for a new KF?
rc->constrained_gf_group = (gop_coding_frames >= rc->frames_to_key) ? 1 : 0;
@@ -3600,32 +3569,71 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
else
twopass->fr_content_type = FC_NORMAL;
- // Keyframe and section processing.
- if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
- // Define next KF group and assign bits to it.
- find_next_key_frame(cpi, show_idx);
+ // If the external rate control model for GOP is used, the GOP decisions
+ // are overwritten, including whether to use a key frame in this GF group,
+ // the GF group length, and whether to use an ARF.
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+ cpi->ext_ratectrl.funcs.get_gop_decision != NULL &&
+ rc->frames_till_gf_update_due == 0) {
+ vpx_codec_err_t codec_status;
+ vpx_rc_gop_decision_t gop_decision;
+ codec_status =
+ vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_decision);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_get_gop_decision() failed");
+ }
+ if (gop_decision.use_key_frame) {
+ cpi->common.frame_type = KEY_FRAME;
+ rc->frames_since_key = 0;
+ // Clear the alt ref active flag and last group multi arf flags as they
+ // can never be set for a key frame.
+ rc->source_alt_ref_active = 0;
+ // KF is always a GF so clear frames till next gf counter.
+ rc->frames_till_gf_update_due = 0;
+ }
+
+ // A new GF group
+ if (rc->frames_till_gf_update_due == 0) {
+ vp9_zero(twopass->gf_group);
+ ++rc->gop_global_index;
+ if (gop_decision.use_alt_ref) {
+ rc->source_alt_ref_pending = 1;
+ }
+ rc->baseline_gf_interval =
+ gop_decision.gop_coding_frames - rc->source_alt_ref_pending;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ define_gf_group_structure(cpi);
+ }
} else {
- cm->frame_type = INTER_FRAME;
- }
+ // Keyframe and section processing.
+ if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+ // Define next KF group and assign bits to it.
+ find_next_key_frame(cpi, show_idx);
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
- // Define a new GF/ARF group. (Should always enter here for key frames).
- if (rc->frames_till_gf_update_due == 0) {
- define_gf_group(cpi, show_idx);
+ // Define a new GF/ARF group. (Should always enter here for key frames).
+ if (rc->frames_till_gf_update_due == 0) {
+ define_gf_group(cpi, show_idx);
- rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
#if ARF_STATS_OUTPUT
- {
- FILE *fpfile;
- fpfile = fopen("arf.stt", "a");
- ++arf_count;
- fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n",
- cm->current_video_frame, rc->frames_till_gf_update_due,
- rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type);
-
- fclose(fpfile);
- }
+ {
+ FILE *fpfile;
+ fpfile = fopen("arf.stt", "a");
+ ++arf_count;
+ fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n",
+ cm->current_video_frame, rc->frames_till_gf_update_due,
+ rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type);
+
+ fclose(fpfile);
+ }
#endif
+ }
}
vp9_configure_buffer_updates(cpi, gf_group->index);
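The net effect in vp9_rc_get_second_pass_params: when an external GOP controller is attached, it is consulted once per GF group boundary and can force a key frame, set the group length, and request an ARF, while the legacy find_next_key_frame()/define_gf_group() path survives intact in the else branch. Condensed control flow (a sketch of the hunk above, not verbatim source):

    static void second_pass_params_sketch(VP9_COMP *cpi, RATE_CONTROL *rc) {
      /* Assumes the external GOP RC readiness checks already passed. */
      if (rc->frames_till_gf_update_due == 0) {
        vpx_rc_gop_decision_t d;
        vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &d);
        if (d.use_key_frame) cpi->common.frame_type = KEY_FRAME;
        if (d.use_alt_ref) rc->source_alt_ref_pending = 1;
        rc->baseline_gf_interval =
            d.gop_coding_frames - rc->source_alt_ref_pending;
        rc->frames_till_gf_update_due = rc->baseline_gf_interval;
        define_gf_group_structure(cpi); /* structure only; no bit allocation */
      }
      /* else: legacy find_next_key_frame() / define_gf_group() path. */
    }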
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
index 97838c38e6..b6be4f88ac 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
@@ -9,6 +9,7 @@
*/
#include <assert.h>
#include <stdlib.h>
+#include <string.h>
#include "./vpx_config.h"
@@ -81,7 +82,6 @@ bail:
return NULL;
}
-#define USE_PARTIAL_COPY 0
int vp9_lookahead_full(const struct lookahead_ctx *ctx) {
return ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz;
}
@@ -94,11 +94,6 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
int64_t ts_start, int64_t ts_end, int use_highbitdepth,
vpx_enc_frame_flags_t flags) {
struct lookahead_entry *buf;
-#if USE_PARTIAL_COPY
- int row, col, active_end;
- int mb_rows = (src->y_height + 15) >> 4;
- int mb_cols = (src->y_width + 15) >> 4;
-#endif
int width = src->y_crop_width;
int height = src->y_crop_height;
int uv_width = src->uv_crop_width;
@@ -119,76 +114,36 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
height != buf->img.y_crop_height ||
uv_width != buf->img.uv_crop_width ||
uv_height != buf->img.uv_crop_height;
- larger_dimensions = width > buf->img.y_width || height > buf->img.y_height ||
- uv_width > buf->img.uv_width ||
- uv_height > buf->img.uv_height;
+ larger_dimensions =
+ width > buf->img.y_crop_width || height > buf->img.y_crop_height ||
+ uv_width > buf->img.uv_crop_width || uv_height > buf->img.uv_crop_height;
assert(!larger_dimensions || new_dimensions);
-#if USE_PARTIAL_COPY
- // TODO(jkoleszar): This is disabled for now, as
- // vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
-
- // Only do this partial copy if the following conditions are all met:
- // 1. Lookahead queue has has size of 1.
- // 2. Active map is provided.
- // 3. This is not a key frame, golden nor altref frame.
- if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) {
- for (row = 0; row < mb_rows; ++row) {
- col = 0;
-
- while (1) {
- // Find the first active macroblock in this row.
- for (; col < mb_cols; ++col) {
- if (active_map[col]) break;
- }
-
- // No more active macroblock in this row.
- if (col == mb_cols) break;
-
- // Find the end of active region in this row.
- active_end = col;
-
- for (; active_end < mb_cols; ++active_end) {
- if (!active_map[active_end]) break;
- }
-
- // Only copy this active region.
- vp9_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4,
- 16, (active_end - col) << 4);
-
- // Start again from the end of this active region.
- col = active_end;
- }
-
- active_map += mb_cols;
- }
- } else {
-#endif
- if (larger_dimensions) {
- YV12_BUFFER_CONFIG new_img;
- memset(&new_img, 0, sizeof(new_img));
- if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x,
- subsampling_y,
+ if (larger_dimensions) {
+ YV12_BUFFER_CONFIG new_img;
+ memset(&new_img, 0, sizeof(new_img));
+ if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x,
+ subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
- use_highbitdepth,
+ use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, 0))
- return 1;
- vpx_free_frame_buffer(&buf->img);
- buf->img = new_img;
- } else if (new_dimensions) {
- buf->img.y_crop_width = src->y_crop_width;
- buf->img.y_crop_height = src->y_crop_height;
- buf->img.uv_crop_width = src->uv_crop_width;
- buf->img.uv_crop_height = src->uv_crop_height;
- buf->img.subsampling_x = src->subsampling_x;
- buf->img.subsampling_y = src->subsampling_y;
- }
- // Partial copy not implemented yet
- vp9_copy_and_extend_frame(src, &buf->img);
-#if USE_PARTIAL_COPY
+ VP9_ENC_BORDER_IN_PIXELS, 0))
+ return 1;
+ vpx_free_frame_buffer(&buf->img);
+ buf->img = new_img;
+ } else if (new_dimensions) {
+ buf->img.y_width = src->y_width;
+ buf->img.y_height = src->y_height;
+ buf->img.uv_width = src->uv_width;
+ buf->img.uv_height = src->uv_height;
+ buf->img.y_crop_width = src->y_crop_width;
+ buf->img.y_crop_height = src->y_crop_height;
+ buf->img.uv_crop_width = src->uv_crop_width;
+ buf->img.uv_crop_height = src->uv_crop_height;
+ buf->img.subsampling_x = src->subsampling_x;
+ buf->img.subsampling_y = src->subsampling_y;
}
-#endif
+ vp9_copy_and_extend_frame(src, &buf->img);
buf->ts_start = ts_start;
buf->ts_end = ts_end;
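Two things happen in vp9_lookahead_push: the long-dead USE_PARTIAL_COPY path goes away (its only callee, vp9_copy_and_extend_frame_with_rect, is deleted from vp9_extend.c above), and larger_dimensions is now computed from the buffer's crop dimensions rather than its allocated dimensions; the new_dimensions branch also starts refreshing the full y/uv width and height fields, not just the crop fields. My reading of the dimension change (an inference, not stated in the diff):

    /* width/height/uv_* come from the source's crop fields, so comparing
     * them against the destination's allocated y_width/y_height mixed two
     * different quantities. Comparing crop against crop keeps the realloc
     * test consistent with the crop-based new_dimensions test: */
    larger_dimensions =
        width > buf->img.y_crop_width || height > buf->img.y_crop_height ||
        uv_width > buf->img.uv_crop_width ||
        uv_height > buf->img.uv_crop_height;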
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
index 0843cd97e4..6e124f9944 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
@@ -10,6 +10,7 @@
#include <assert.h>
+#include "vpx_util/vpx_pthread.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_ethread.h"
#include "vp9/encoder/vp9_multi_thread.h"
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
index 3f4fe6957b..d37e020b0a 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
@@ -12,6 +12,7 @@
#include <math.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/bitops.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_quant_common.h"
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
index 62d6b93028..76d5435e60 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -35,6 +35,7 @@
#include "vp9/encoder/vp9_ext_ratectrl.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
#include "vpx/vpx_codec.h"
#include "vpx/vpx_ext_ratectrl.h"
@@ -1433,8 +1434,8 @@ static int rc_constant_q(const VP9_COMP *cpi, int *bottom_index, int *top_index,
return q;
}
-static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
- int *top_index, int gf_group_index) {
+int vp9_rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
+ int *top_index, int gf_group_index) {
const VP9_COMMON *const cm = &cpi->common;
const RATE_CONTROL *const rc = &cpi->rc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
@@ -1581,7 +1582,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
q = active_worst_quality;
}
}
- clamp(q, active_best_quality, active_worst_quality);
*top_index = active_worst_quality;
*bottom_index = active_best_quality;
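The deleted clamp() call was a no-op: clamp in vpx is a pure function that returns the clamped value, so invoking it without assigning the result changed nothing. A sketch of the helper (reproduced from memory of vpx_dsp/vpx_dsp_common.h):

    /* Pure helper: no side effects, the result must be assigned. */
    static inline int clamp(int value, int low, int high) {
      return value < low ? low : (value > high ? high : value);
    }
    /* Effective usage would have been
     *   q = clamp(q, active_best_quality, active_worst_quality);
     * but the bounds are exported via top/bottom_index anyway, so dropping
     * the dead statement is behavior-preserving. */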
@@ -1603,8 +1603,8 @@ int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index,
else
q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index);
} else {
- q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index,
- gf_group_index);
+ q = vp9_rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index,
+ gf_group_index);
}
if (cpi->sf.use_nonrd_pick_mode) {
if (cpi->sf.force_frame_boost == 1) q -= cpi->sf.max_delta_qindex;
@@ -1675,63 +1675,6 @@ void vp9_configure_buffer_updates(VP9_COMP *cpi, int gf_group_index) {
}
}
-void vp9_estimate_qp_gop(VP9_COMP *cpi) {
- int gop_length = cpi->twopass.gf_group.gf_group_size;
- int bottom_index, top_index;
- int idx;
- const int gf_index = cpi->twopass.gf_group.index;
- const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref;
- const int refresh_frame_context = cpi->common.refresh_frame_context;
-
- for (idx = 1; idx <= gop_length; ++idx) {
- TplDepFrame *tpl_frame = &cpi->tpl_stats[idx];
- int target_rate = cpi->twopass.gf_group.bit_allocation[idx];
- cpi->twopass.gf_group.index = idx;
- vp9_rc_set_frame_target(cpi, target_rate);
- vp9_configure_buffer_updates(cpi, idx);
- if (cpi->tpl_with_external_rc) {
- if (cpi->ext_ratectrl.ready &&
- (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
- cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
- VP9_COMMON *cm = &cpi->common;
- vpx_codec_err_t codec_status;
- const GF_GROUP *gf_group = &cpi->twopass.gf_group;
- vpx_rc_encodeframe_decision_t encode_frame_decision;
- FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
- RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
- const RefCntBuffer *curr_frame_buf =
- get_ref_cnt_buffer(cm, cm->new_fb_idx);
- // index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
- // index 1 refers to the first encoding frame in a gf group.
- // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref.
- // See function define_gf_group_structure().
- const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
- const int frame_coding_index = cm->current_frame_coding_index + idx - 1;
- get_ref_frame_bufs(cpi, ref_frame_bufs);
- codec_status = vp9_extrc_get_encodeframe_decision(
- &cpi->ext_ratectrl, curr_frame_buf->frame_index, frame_coding_index,
- gf_group->index, update_type, gf_group->gf_group_size, use_alt_ref,
- ref_frame_bufs, 0 /*ref_frame_flags is not used*/,
- &encode_frame_decision);
- if (codec_status != VPX_CODEC_OK) {
- vpx_internal_error(&cm->error, codec_status,
- "vp9_extrc_get_encodeframe_decision() failed");
- }
- tpl_frame->base_qindex = encode_frame_decision.q_index;
- }
- } else {
- tpl_frame->base_qindex =
- rc_pick_q_and_bounds_two_pass(cpi, &bottom_index, &top_index, idx);
- tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1);
- }
- }
- // Reset the actual index and frame update
- cpi->twopass.gf_group.index = gf_index;
- cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref;
- cpi->common.refresh_frame_context = refresh_frame_context;
- vp9_configure_buffer_updates(cpi, gf_index);
-}
-
void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int frame_target,
int *frame_under_shoot_limit,
int *frame_over_shoot_limit) {
@@ -3361,14 +3304,20 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
cpi->rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
}
// For temporal layers, reset the rate control parametes across all
- // temporal layers. If the first_spatial_layer_to_encode > 0, then this
- // superframe has skipped lower base layers. So in this case we should also
- // reset and force max-q for spatial layers < first_spatial_layer_to_encode.
+ // temporal layers.
+ // If the first_spatial_layer_to_encode > 0, then this superframe has
+ // skipped lower base layers. So in this case we should also reset and
+ // force max-q for spatial layers < first_spatial_layer_to_encode.
+ // For the case of no inter-layer prediction on delta frames: reset and
+ // force max-q for all spatial layers, to avoid excessive frame drops.
if (cpi->use_svc) {
int tl = 0;
int sl = 0;
SVC *svc = &cpi->svc;
- for (sl = 0; sl < VPXMAX(1, svc->first_spatial_layer_to_encode); ++sl) {
+ int num_spatial_layers = VPXMAX(1, svc->first_spatial_layer_to_encode);
+ if (svc->disable_inter_layer_pred != INTER_LAYER_PRED_ON)
+ num_spatial_layers = svc->number_spatial_layers;
+ for (sl = 0; sl < num_spatial_layers; ++sl) {
for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
const int layer =
LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
index 48c49e937e..0c61ad3461 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -346,12 +346,14 @@ int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q);
void vp9_configure_buffer_updates(struct VP9_COMP *cpi, int gf_group_index);
-void vp9_estimate_qp_gop(struct VP9_COMP *cpi);
-
void vp9_compute_frame_low_motion(struct VP9_COMP *const cpi);
void vp9_update_buffer_level_svc_preencode(struct VP9_COMP *cpi);
+int vp9_rc_pick_q_and_bounds_two_pass(const struct VP9_COMP *cpi,
+ int *bottom_index, int *top_index,
+ int gf_group_index);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
index 974e43c90f..447136ed84 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
@@ -1834,7 +1834,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi,
return 1;
}
-static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) {
+static INLINE int skip_iters(int_mv iter_mvs[][2], int ite, int id) {
if (ite >= 2 && iter_mvs[ite - 2][!id].as_int == iter_mvs[ite][!id].as_int) {
int_mv cur_fullpel_mv, prev_fullpel_mv;
cur_fullpel_mv.as_mv.row = iter_mvs[ite][id].as_mv.row >> 3;
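Dropping const from the array parameter is most likely about C's qualifier rules: unlike C++, C has no implicit conversion from int_mv (*)[2] to const int_mv (*)[2], so passing the caller's non-const array to the const-qualified parameter draws incompatible-pointer-type diagnostics. A minimal reproduction of the rule (a sketch, not project code):

    static void takes_const(const int (*p)[2]) { (void)p; }

    static void caller(void) {
      int a[3][2] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
      /* Constraint violation in C (no implicit int(*)[2] ->
       * const int(*)[2] conversion); well-formed in C++. Compilers flag
       * it with -Wincompatible-pointer-types or similar. */
      takes_const(a);
    }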
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
index b8910370e0..048ab8732d 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
@@ -18,9 +18,12 @@
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_scan.h"
#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_tpl_model.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vpx_codec.h"
+#include "vpx/vpx_ext_ratectrl.h"
static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture,
const GF_GROUP *gf_group, int *tpl_group_frames) {
@@ -407,8 +410,12 @@ static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats,
tpl_block_stats_ptr->col = mi_col * 8;
tpl_block_stats_ptr->inter_cost = src_stats->inter_cost;
tpl_block_stats_ptr->intra_cost = src_stats->intra_cost;
- tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
- tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+ // inter/intra_cost here is calculated with SATD, which should be close
+ // enough to be used as inter/intra_pred_err.
+ tpl_block_stats_ptr->inter_pred_err = src_stats->inter_cost;
+ tpl_block_stats_ptr->intra_pred_err = src_stats->intra_cost;
+ tpl_block_stats_ptr->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_block_stats_ptr->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
tpl_block_stats_ptr->mv_r = src_stats->mv.as_mv.row;
tpl_block_stats_ptr->mv_c = src_stats->mv.as_mv.col;
tpl_block_stats_ptr->ref_frame_index = ref_frame_idx;
@@ -721,7 +728,9 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
tpl_stats->intra_cost = VPXMAX(
1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
- tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
+ if (best_rf_idx >= 0) {
+ tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
+ }
tpl_stats->mv.as_int = best_mv.as_int;
*ref_frame_idx = best_rf_idx;
}
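The new guard matters because the TPL pass can now visit frames with no inter reference at all (see the key-frame intra-search call added further down); in that case best_rf_idx keeps its "no reference searched" sentinel -- presumably -1, its initialization is outside this hunk -- and the old unconditional indexing would read past the ref_frame array. Guarded pattern from the hunk above:

    /* -1 as the "no reference searched" sentinel is an assumption; its
     * initialization is not shown in this diff. */
    if (best_rf_idx >= 0) {
      tpl_stats->ref_frame_index =
          gf_picture[frame_idx].ref_frame[best_rf_idx];
    }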
@@ -1489,6 +1498,53 @@ static void accumulate_frame_tpl_stats(VP9_COMP *cpi) {
}
#endif // CONFIG_RATE_CTRL
+void vp9_estimate_tpl_qp_gop(VP9_COMP *cpi) {
+ int gop_length = cpi->twopass.gf_group.gf_group_size;
+ int bottom_index, top_index;
+ int idx;
+ const int gf_index = cpi->twopass.gf_group.index;
+ const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref;
+ const int refresh_frame_context = cpi->common.refresh_frame_context;
+
+ for (idx = 1; idx <= gop_length; ++idx) {
+ TplDepFrame *tpl_frame = &cpi->tpl_stats[idx];
+ int target_rate = cpi->twopass.gf_group.bit_allocation[idx];
+ cpi->twopass.gf_group.index = idx;
+ vp9_rc_set_frame_target(cpi, target_rate);
+ vp9_configure_buffer_updates(cpi, idx);
+ if (cpi->tpl_with_external_rc) {
+ VP9_COMMON *cm = &cpi->common;
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
+ cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
+ vpx_codec_err_t codec_status;
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ vpx_rc_encodeframe_decision_t encode_frame_decision;
+ codec_status = vp9_extrc_get_encodeframe_decision(
+ &cpi->ext_ratectrl, gf_group->index - 1, &encode_frame_decision);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_get_encodeframe_decision() failed");
+ }
+ tpl_frame->base_qindex = encode_frame_decision.q_index;
+ } else {
+ vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
+ "The external rate control library is not set "
+ "properly for TPL pass.");
+ }
+ } else {
+ tpl_frame->base_qindex = vp9_rc_pick_q_and_bounds_two_pass(
+ cpi, &bottom_index, &top_index, idx);
+ tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1);
+ }
+ }
+ // Reset the actual index and frame update
+ cpi->twopass.gf_group.index = gf_index;
+ cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref;
+ cpi->common.refresh_frame_context = refresh_frame_context;
+ vp9_configure_buffer_updates(cpi, gf_index);
+}
+
void vp9_setup_tpl_stats(VP9_COMP *cpi) {
GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE];
const GF_GROUP *gf_group = &cpi->twopass.gf_group;
@@ -1512,12 +1568,16 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) {
mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize);
}
- // TPL stats has extra frames from next GOP. Trim those extra frames for
- // Qmode.
- trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, extended_frame_count);
-
if (cpi->ext_ratectrl.ready &&
cpi->ext_ratectrl.funcs.send_tpl_gop_stats != NULL) {
+ // Intra search on key frame
+ if (gf_picture[0].update_type == KF_UPDATE) {
+ mc_flow_dispenser(cpi, gf_picture, 0, cpi->tpl_bsize);
+ }
+ // TPL stats have extra frames from the next GOP. Trim those extra frames
+ // for Qmode.
+ trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats,
+ extended_frame_count);
const vpx_codec_err_t codec_status =
vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats);
if (codec_status != VPX_CODEC_OK) {
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
index 04beb22610..de0ac39a1f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
@@ -31,6 +31,7 @@ typedef struct GF_PICTURE {
void vp9_init_tpl_buffer(VP9_COMP *cpi);
void vp9_setup_tpl_stats(VP9_COMP *cpi);
void vp9_free_tpl_buffer(VP9_COMP *cpi);
+void vp9_estimate_tpl_qp_gop(VP9_COMP *cpi);
void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
TX_SIZE tx_size);
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index 94506aad0f..628dc4fead 100644
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -886,14 +886,14 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
scale_plane_1_to_2_phase_0(
src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w,
src_h, vp9_filter_kernels[filter_type][8], temp_buffer);
- scale_plane_1_to_2_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
- dst->uv_stride, src_w / 2, src_h / 2,
- vp9_filter_kernels[filter_type][8],
- temp_buffer);
- scale_plane_1_to_2_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
- dst->uv_stride, src_w / 2, src_h / 2,
- vp9_filter_kernels[filter_type][8],
- temp_buffer);
+ const int src_uv_w = src->uv_crop_width;
+ const int src_uv_h = src->uv_crop_height;
+ scale_plane_1_to_2_phase_0(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ src_uv_w, src_uv_h, vp9_filter_kernels[filter_type][8], temp_buffer);
+ scale_plane_1_to_2_phase_0(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ src_uv_w, src_uv_h, vp9_filter_kernels[filter_type][8], temp_buffer);
free(temp_buffer);
}
}
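The chroma planes were previously scaled at src_w / 2 by src_h / 2, which truncates for odd luma dimensions, while the crop fields round up: a 65-pixel-wide luma plane has 33 chroma columns for 4:2:0, and the halved value 32 would drop the last one. A tiny check capturing the invariant being relied on (assuming the 4:2:0 rounding applied by the yv12 allocator, and that src_w/src_h mirror the luma crop size):

    #include <assert.h>

    /* 4:2:0 chroma crop dims round up; they are not a plain halving. */
    static void check_uv_crop(const YV12_BUFFER_CONFIG *src) {
      assert(src->uv_crop_width == (src->y_crop_width + 1) >> 1);
      assert(src->uv_crop_height == (src->y_crop_height + 1) >> 1);
    }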
diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc
index fd81bce7b5..942c15ce49 100644
--- a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc
+++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc
@@ -12,10 +12,12 @@
#include <new>
#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_picklpf.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_codec.h"
+#include "vpx_mem/vpx_mem.h"
namespace libvpx {
diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.h b/media/libvpx/libvpx/vp9/ratectrl_rtc.h
index 85005c5474..4c39255886 100644
--- a/media/libvpx/libvpx/vp9/ratectrl_rtc.h
+++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.h
@@ -12,43 +12,34 @@
#define VPX_VP9_RATECTRL_RTC_H_
#include <cstdint>
+#include <cstring>
+#include <limits>
#include <memory>
-#include "vp9/common/vp9_enums.h"
-#include "vp9/vp9_iface_common.h"
-#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
-#include "vp9/vp9_cx_iface.h"
+#include "vpx/vpx_encoder.h"
#include "vpx/internal/vpx_ratectrl_rtc.h"
-#include "vpx_mem/vpx_mem.h"
struct VP9_COMP;
namespace libvpx {
struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
- public:
VP9RateControlRtcConfig() {
- ss_number_layers = 1;
- vp9_zero(max_quantizers);
- vp9_zero(min_quantizers);
- vp9_zero(scaling_factor_den);
- vp9_zero(scaling_factor_num);
- vp9_zero(layer_target_bitrate);
- vp9_zero(ts_rate_decimator);
+ memset(layer_target_bitrate, 0, sizeof(layer_target_bitrate));
+ memset(ts_rate_decimator, 0, sizeof(ts_rate_decimator));
scaling_factor_num[0] = 1;
scaling_factor_den[0] = 1;
max_quantizers[0] = max_quantizer;
min_quantizers[0] = min_quantizer;
- max_consec_drop = INT_MAX;
}
// Number of spatial layers
- int ss_number_layers;
- int max_quantizers[VPX_MAX_LAYERS];
- int min_quantizers[VPX_MAX_LAYERS];
- int scaling_factor_num[VPX_SS_MAX_LAYERS];
- int scaling_factor_den[VPX_SS_MAX_LAYERS];
+ int ss_number_layers = 1;
+ int max_quantizers[VPX_MAX_LAYERS] = {};
+ int min_quantizers[VPX_MAX_LAYERS] = {};
+ int scaling_factor_num[VPX_SS_MAX_LAYERS] = {};
+ int scaling_factor_den[VPX_SS_MAX_LAYERS] = {};
// This is only for SVC for now.
- int max_consec_drop;
+ int max_consec_drop = std::numeric_limits<int>::max();
};
struct VP9FrameParamsQpRTC {
@@ -105,9 +96,9 @@ class VP9RateControlRTC {
const VP9FrameParamsQpRTC &frame_params);
private:
- VP9RateControlRTC() {}
+ VP9RateControlRTC() = default;
bool InitRateControl(const VP9RateControlRtcConfig &cfg);
- struct VP9_COMP *cpi_;
+ struct VP9_COMP *cpi_ = nullptr;
};
} // namespace libvpx
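The constructor keeps memset for layer_target_bitrate and ts_rate_decimator, presumably because those arrays live in the base VpxRateControlRtcConfig, while the members declared in this struct move to default member initializers (and INT_MAX becomes std::numeric_limits<int>::max(), dropping the climits dependency). The memset idiom relies on sizeof over an array object covering the whole array; a standalone sketch in C (the VPX_MAX_LAYERS value is assumed here for a self-contained build):

#include <assert.h>
#include <string.h>

#define VPX_MAX_LAYERS 12 /* value as in vpx_encoder.h; assumed here */

int main(void) {
  int layer_target_bitrate[VPX_MAX_LAYERS];
  /* sizeof(array) is the size of the whole array object, so this clears
   * every element; applied to a pointer it would clear only 4 or 8 bytes. */
  memset(layer_target_bitrate, 0, sizeof(layer_target_bitrate));
  assert(layer_target_bitrate[VPX_MAX_LAYERS - 1] == 0);
  return 0;
}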
diff --git a/media/libvpx/libvpx/vp9/simple_encode.cc b/media/libvpx/libvpx/vp9/simple_encode.cc
index 2e6f9a4513..5e565d1b1a 100644
--- a/media/libvpx/libvpx/vp9/simple_encode.cc
+++ b/media/libvpx/libvpx/vp9/simple_encode.cc
@@ -8,8 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <stdio.h>
+#include <stdlib.h>
+
#include <memory>
#include <vector>
+
#include "./ivfenc.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_enums.h"
@@ -888,6 +892,10 @@ void SimpleEncode::ComputeFirstPassStats() {
use_highbitdepth = impl_ptr_->cpi->common.use_highbitdepth;
#endif
vpx_image_t img;
+ if (impl_ptr_->img_fmt == VPX_IMG_FMT_NV12) {
+ fprintf(stderr, "VPX_IMG_FMT_NV12 is not supported\n");
+ abort();
+ }
vpx_img_alloc(&img, impl_ptr_->img_fmt, frame_width_, frame_height_, 1);
rewind(in_file_);
impl_ptr_->first_pass_stats.clear();
@@ -1053,6 +1061,10 @@ void SimpleEncode::StartEncode() {
vp9_set_first_pass_stats(&oxcf, &stats);
assert(impl_ptr_->cpi == nullptr);
impl_ptr_->cpi = init_encoder(&oxcf, impl_ptr_->img_fmt);
+ if (impl_ptr_->img_fmt == VPX_IMG_FMT_NV12) {
+ fprintf(stderr, "VPX_IMG_FMT_NV12 is not supported\n");
+ abort();
+ }
vpx_img_alloc(&impl_ptr_->tmp_img, impl_ptr_->img_fmt, frame_width_,
frame_height_, 1);
diff --git a/media/libvpx/libvpx/vp9/vp9_cx_iface.c b/media/libvpx/libvpx/vp9/vp9_cx_iface.c
index 8df04f29f0..fe62bac5f2 100644
--- a/media/libvpx/libvpx/vp9/vp9_cx_iface.c
+++ b/media/libvpx/libvpx/vp9/vp9_cx_iface.c
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <limits.h>
+#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@@ -17,6 +19,7 @@
#include "vpx_dsp/psnr.h"
#include "vpx_ports/static_assert.h"
#include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_thread.h"
#include "vpx_util/vpx_timestamp.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "./vpx_version.h"
@@ -110,7 +113,6 @@ struct vpx_codec_alg_priv {
vpx_codec_priv_t base;
vpx_codec_enc_cfg_t cfg;
struct vp9_extracfg extra_cfg;
- vpx_rational64_t timestamp_ratio;
vpx_codec_pts_t pts_offset;
unsigned char pts_offset_initialized;
VP9EncoderConfig oxcf;
@@ -190,7 +192,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 2);
RANGE_CHECK(extra_cfg, alt_ref_aq, 0, 1);
RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1);
- RANGE_CHECK_HI(cfg, g_threads, 64);
+ RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS);
RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q);
RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
@@ -1140,10 +1142,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
if (res == VPX_CODEC_OK) {
priv->pts_offset_initialized = 0;
- // TODO(angiebird): Replace priv->timestamp_ratio by
- // oxcf->g_timebase_in_ts
- priv->timestamp_ratio = get_g_timebase_in_ts(priv->cfg.g_timebase);
-
set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
#if CONFIG_VP9_HIGHBITDEPTH
priv->oxcf.use_highbitdepth =
@@ -1166,9 +1164,9 @@ static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
return VPX_CODEC_OK;
}
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
- unsigned long duration,
- vpx_enc_deadline_t deadline) {
+static vpx_codec_err_t pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
+ unsigned long duration,
+ vpx_enc_deadline_t deadline) {
MODE new_mode = BEST;
#if CONFIG_REALTIME_ONLY
@@ -1179,13 +1177,16 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
case VPX_RC_ONE_PASS:
if (deadline > 0) {
// Convert duration parameter from stream timebase to microseconds.
- uint64_t duration_us;
-
VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 &&
(TICKS_PER_SEC % 1000000) == 0);
- duration_us = duration * (uint64_t)ctx->timestamp_ratio.num /
- (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000));
+ if (duration > UINT64_MAX / (uint64_t)ctx->oxcf.g_timebase_in_ts.num) {
+ ERROR("duration is too big");
+ }
+ uint64_t duration_us = duration *
+ (uint64_t)ctx->oxcf.g_timebase_in_ts.num /
+ ((uint64_t)ctx->oxcf.g_timebase_in_ts.den *
+ (TICKS_PER_SEC / 1000000));
        // If the deadline is more than the duration this frame is to be shown,
// use good quality mode. Otherwise use realtime mode.
@@ -1208,6 +1209,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
ctx->oxcf.mode = new_mode;
vp9_change_config(ctx->cpi, &ctx->oxcf);
}
+ return VPX_CODEC_OK;
}
// Turn on to test if supplemental superframe data breaks decoding
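The rewritten conversion multiplies before it divides, so the new guard rejects any duration for which duration * num would exceed UINT64_MAX. The same pattern in isolation, with illustrative timebase values:

#include <stdint.h>
#include <stdio.h>

#define TICKS_PER_SEC 10000000 /* assumed: a multiple of 1000000, as asserted above */

/* Convert a duration in timebase units to microseconds, failing instead of
 * overflowing. Returns 0 on success, -1 if duration * num would overflow. */
static int duration_to_us(uint64_t duration, uint64_t num, uint64_t den,
                          uint64_t *out_us) {
  if (num != 0 && duration > UINT64_MAX / num) return -1; /* would overflow */
  *out_us = duration * num / (den * (TICKS_PER_SEC / 1000000));
  return 0;
}

int main(void) {
  /* num = 10000000, den = 30000: a 1/30000 s timebase scaled to 10 MHz
   * ticks, so 3000 units = 0.1 s = 100000 us. */
  uint64_t us;
  if (duration_to_us(3000, 10000000, 30000, &us) == 0)
    printf("%llu us\n", (unsigned long long)us);
  return 0;
}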
@@ -1281,6 +1283,10 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi,
.is_key_frame))
flags |= VPX_FRAME_IS_KEY;
+ if (!cpi->common.show_frame) {
+ flags |= VPX_FRAME_IS_INVISIBLE;
+ }
+
if (cpi->droppable) flags |= VPX_FRAME_IS_DROPPABLE;
return flags;
@@ -1318,7 +1324,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
volatile vpx_enc_frame_flags_t flags = enc_flags;
volatile vpx_codec_pts_t pts = pts_val;
VP9_COMP *const cpi = ctx->cpi;
- const vpx_rational64_t *const timestamp_ratio = &ctx->timestamp_ratio;
+ const vpx_rational64_t *const timebase_in_ts = &ctx->oxcf.g_timebase_in_ts;
size_t data_sz;
vpx_codec_cx_pkt_t pkt;
memset(&pkt, 0, sizeof(pkt));
@@ -1347,13 +1353,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
}
}
- if (!ctx->pts_offset_initialized) {
- ctx->pts_offset = pts;
- ctx->pts_offset_initialized = 1;
+ res = pick_quickcompress_mode(ctx, duration, deadline);
+ if (res != VPX_CODEC_OK) {
+ return res;
}
- pts -= ctx->pts_offset;
-
- pick_quickcompress_mode(ctx, duration, deadline);
vpx_codec_pkt_list_init(&ctx->pkt_list);
// Handle Flags
@@ -1384,20 +1387,53 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
if (res == VPX_CODEC_OK) {
unsigned int lib_flags = 0;
- YV12_BUFFER_CONFIG sd;
- int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, pts);
size_t size, cx_data_sz;
unsigned char *cx_data;
- cpi->svc.timebase_fac = timebase_units_to_ticks(timestamp_ratio, 1);
- cpi->svc.time_stamp_superframe = dst_time_stamp;
-
// Set up internal flags
if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
if (img != NULL) {
+ YV12_BUFFER_CONFIG sd;
+
+ if (!ctx->pts_offset_initialized) {
+ ctx->pts_offset = pts;
+ ctx->pts_offset_initialized = 1;
+ }
+ if (pts < ctx->pts_offset) {
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "pts is smaller than initial pts");
+ }
+ pts -= ctx->pts_offset;
+ if (pts > INT64_MAX / timebase_in_ts->num) {
+ vpx_internal_error(
+ &cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "conversion of relative pts to ticks would overflow");
+ }
+ const int64_t dst_time_stamp =
+ timebase_units_to_ticks(timebase_in_ts, pts);
+
+ cpi->svc.timebase_fac = timebase_units_to_ticks(timebase_in_ts, 1);
+ cpi->svc.time_stamp_superframe = dst_time_stamp;
+
+#if ULONG_MAX > INT64_MAX
+ if (duration > INT64_MAX) {
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "duration is too big");
+ }
+#endif
+ if (pts > INT64_MAX - (int64_t)duration) {
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "relative pts + duration is too big");
+ }
+ vpx_codec_pts_t pts_end = pts + (int64_t)duration;
+ if (pts_end > INT64_MAX / timebase_in_ts->num) {
+ vpx_internal_error(
+ &cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "conversion of relative pts + duration to ticks would overflow");
+ }
const int64_t dst_end_time_stamp =
- timebase_units_to_ticks(timestamp_ratio, pts + duration);
+ timebase_units_to_ticks(timebase_in_ts, pts_end);
res = image2yuvconfig(img, &sd);
if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
@@ -1434,7 +1470,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
if (cx_data_sz < ctx->cx_data_sz / 2) {
vpx_internal_error(&cpi->common.error, VPX_CODEC_ERROR,
"Compressed data buffer too small");
- return VPX_CODEC_ERROR;
}
}
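The block above moves pts normalization inside the img != NULL path and makes each step checked: the first frame pins pts_offset, earlier timestamps are rejected, and the relative pts (and pts + duration) is range-checked before it is scaled into ticks. The checks report through vpx_internal_error(), which does not return when the encoder's error jump point is armed; that is presumably also why the explicit return after the buffer-size check above could be dropped. A condensed sketch of the pts sequence (the context struct is illustrative, not the real vpx_codec_alg_priv_t):

#include <stdint.h>

typedef struct {
  int64_t pts_offset;
  int initialized;
} pts_state;

/* Returns 0 and writes the tick-safe relative pts, or -1 on invalid input.
 * tb_num is the (positive) numerator of the timebase-in-ticks rational. */
static int make_relative_pts(pts_state *st, int64_t pts, int64_t tb_num,
                             int64_t *rel_pts) {
  if (!st->initialized) { /* the first frame establishes the offset */
    st->pts_offset = pts;
    st->initialized = 1;
  }
  if (pts < st->pts_offset) return -1;     /* pts is smaller than initial pts */
  pts -= st->pts_offset;
  if (pts > INT64_MAX / tb_num) return -1; /* pts * num would overflow */
  *rel_pts = pts;
  return 0;
}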
@@ -1443,6 +1478,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
// compute first pass stats
if (img) {
int ret;
+ int64_t dst_time_stamp;
int64_t dst_end_time_stamp;
vpx_codec_cx_pkt_t fps_pkt;
ENCODE_FRAME_RESULT encode_frame_result;
@@ -1469,6 +1505,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
#endif // !CONFIG_REALTIME_ONLY
} else {
ENCODE_FRAME_RESULT encode_frame_result;
+ int64_t dst_time_stamp;
int64_t dst_end_time_stamp;
vp9_init_encode_frame_result(&encode_frame_result);
while (cx_data_sz >= ctx->cx_data_sz / 2 &&
@@ -1507,10 +1544,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
if (ctx->output_cx_pkt_cb.output_cx_pkt) {
pkt.kind = VPX_CODEC_CX_FRAME_PKT;
pkt.data.frame.pts =
- ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) +
+ ticks_to_timebase_units(timebase_in_ts, dst_time_stamp) +
ctx->pts_offset;
pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(
- timestamp_ratio, dst_end_time_stamp - dst_time_stamp);
+ timebase_in_ts, dst_end_time_stamp - dst_time_stamp);
pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
pkt.data.frame.buf = ctx->pending_cx_data;
pkt.data.frame.sz = size;
@@ -1527,10 +1564,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
// Add the frame packet to the list of returned packets.
pkt.kind = VPX_CODEC_CX_FRAME_PKT;
pkt.data.frame.pts =
- ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) +
+ ticks_to_timebase_units(timebase_in_ts, dst_time_stamp) +
ctx->pts_offset;
pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(
- timestamp_ratio, dst_end_time_stamp - dst_time_stamp);
+ timebase_in_ts, dst_end_time_stamp - dst_time_stamp);
pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width;
pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height;
@@ -1979,6 +2016,7 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx,
ratectrl_config.frame_rate_den = oxcf->g_timebase.num;
ratectrl_config.overshoot_percent = oxcf->over_shoot_pct;
ratectrl_config.undershoot_percent = oxcf->under_shoot_pct;
+ ratectrl_config.base_qp = oxcf->cq_level;
if (oxcf->rc_mode == VPX_VBR) {
ratectrl_config.rc_mode = VPX_RC_VBR;
@@ -2223,7 +2261,7 @@ static vpx_codec_enc_cfg_t get_enc_cfg(int frame_width, int frame_height,
return enc_cfg;
}
-static vp9_extracfg get_extra_cfg() {
+static vp9_extracfg get_extra_cfg(void) {
vp9_extracfg extra_cfg = default_extra_cfg;
return extra_cfg;
}
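With ctx->timestamp_ratio gone, every conversion in this file goes through oxcf.g_timebase_in_ts, so packet pts and duration come from a single source of truth. The two helpers the packet code uses are re-implemented below for illustration only (the real ones live in this file; the rounding in the reverse direction is chosen so a ticks round trip reproduces the input):

#include <stdint.h>

typedef struct { int64_t num, den; } rational64; /* stand-in for vpx_rational64_t */

static int64_t timebase_units_to_ticks(const rational64 *tb, int64_t n) {
  return n * tb->num / tb->den;
}

static int64_t ticks_to_timebase_units(const rational64 *tb, int64_t n) {
  int64_t round = tb->num / 2;
  if (round > 0) --round; /* round so the ticks round trip is exact */
  return (n * tb->den + round) / tb->num;
}

int main(void) {
  const rational64 tb = { 10000000, 30000 }; /* 1/30000 s in 10 MHz ticks */
  const int64_t pts = 90;
  const int64_t ticks = timebase_units_to_ticks(&tb, pts); /* 30000 ticks */
  return ticks_to_timebase_units(&tb, ticks) == pts ? 0 : 1;
}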
diff --git a/media/libvpx/libvpx/vp9/vp9_dx_iface.c b/media/libvpx/libvpx/vp9/vp9_dx_iface.c
index 860f721dc5..7567910b9b 100644
--- a/media/libvpx/libvpx/vp9/vp9_dx_iface.c
+++ b/media/libvpx/libvpx/vp9/vp9_dx_iface.c
@@ -19,7 +19,6 @@
#include "vpx/vpx_decoder.h"
#include "vpx_dsp/bitreader_buffer.h"
#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_util/vpx_thread.h"
#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_frame_buffers.h"
diff --git a/media/libvpx/libvpx/vp9/vp9cx.mk b/media/libvpx/libvpx/vp9/vp9cx.mk
index 44790ef6a4..7a0e2d8d1f 100644
--- a/media/libvpx/libvpx/vp9/vp9cx.mk
+++ b/media/libvpx/libvpx/vp9/vp9cx.mk
@@ -140,6 +140,7 @@ endif
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c
+VP9_CX_SRCS-$(HAVE_SVE) += encoder/arm/neon/vp9_error_sve.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
diff --git a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h
index 01d64b14b7..2643b5578a 100644
--- a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h
+++ b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h
@@ -22,8 +22,14 @@ enum class FrameDropDecision {
kDrop, // Frame is dropped.
};
+struct UVDeltaQP {
+ // For the UV channel: the QP for the dc/ac value is given as
+ // GetQP() + uvdc/ac_delta_q, where the uvdc/ac_delta_q are negative numbers.
+ int uvdc_delta_q;
+ int uvac_delta_q;
+};
+
struct VpxRateControlRtcConfig {
- public:
VpxRateControlRtcConfig() {
width = 1280;
height = 720;
diff --git a/media/libvpx/libvpx/vpx/src/vpx_encoder.c b/media/libvpx/libvpx/vpx/src/vpx_encoder.c
index 017525aeee..001d854abe 100644
--- a/media/libvpx/libvpx/vpx/src/vpx_encoder.c
+++ b/media/libvpx/libvpx/vpx/src/vpx_encoder.c
@@ -14,6 +14,7 @@
*/
#include <assert.h>
#include <limits.h>
+#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vp8/common/blockd.h"
@@ -184,8 +185,8 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface,
while (0)
#else
-static void FLOATING_POINT_INIT() {}
-static void FLOATING_POINT_RESTORE() {}
+static void FLOATING_POINT_INIT(void) {}
+static void FLOATING_POINT_RESTORE(void) {}
#endif
vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
@@ -200,6 +201,10 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
res = VPX_CODEC_ERROR;
else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER))
res = VPX_CODEC_INCAPABLE;
+#if ULONG_MAX > UINT32_MAX
+ else if (duration > UINT32_MAX || deadline > UINT32_MAX)
+ res = VPX_CODEC_INVALID_PARAM;
+#endif
else {
unsigned int num_enc = ctx->priv->enc.total_encoders;
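The new #if compiles the range check only where unsigned long is wider than 32 bits; on ILP32 targets the comparison would be always false and some compilers warn about it. The pattern in isolation:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/* Reject values that do not fit in 32 bits, without producing a
 * "comparison is always false" warning where unsigned long is 32-bit. */
static int fits_u32(unsigned long v) {
#if ULONG_MAX > UINT32_MAX
  if (v > UINT32_MAX) return 0;
#endif
  (void)v;
  return 1;
}

int main(void) {
  printf("%d\n", fits_u32(123456789UL)); /* 1: fits in 32 bits */
  return 0;
}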
diff --git a/media/libvpx/libvpx/vpx/src/vpx_image.c b/media/libvpx/libvpx/vpx/src/vpx_image.c
index f9f0dd6025..3f7ff74244 100644
--- a/media/libvpx/libvpx/vpx/src/vpx_image.c
+++ b/media/libvpx/libvpx/vpx/src/vpx_image.c
@@ -27,6 +27,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
if (img != NULL) memset(img, 0, sizeof(vpx_image_t));
+ if (fmt == VPX_IMG_FMT_NONE) goto fail;
+
/* Treat align==0 like align==1 */
if (!buf_align) buf_align = 1;
@@ -56,7 +58,7 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
/* Get chroma shift values for this format */
// For VPX_IMG_FMT_NV12, xcs needs to be 0 such that UV data is all read at
- // one time.
+ // once.
switch (fmt) {
case VPX_IMG_FMT_I420:
case VPX_IMG_FMT_YV12:
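Per the corrected comment, NV12 keeps a horizontal chroma shift of 0 because its U and V samples interleave into a single full-width, half-height plane. A sketch of how a shift pair maps to plane dimensions (rounding behavior is illustrative; the real allocator also applies alignment):

#include <stdio.h>

/* Plane width/height from chroma shifts: a shift of 1 halves a dimension. */
static int chroma_w(int w, int xcs) { return (w + (1 << xcs) - 1) >> xcs; }
static int chroma_h(int h, int ycs) { return (h + (1 << ycs) - 1) >> ycs; }

int main(void) {
  /* I420: xcs = 1, ycs = 1 -> two w/2 x h/2 chroma planes. */
  printf("I420 UV: %d x %d\n", chroma_w(640, 1), chroma_h(480, 1));
  /* NV12: xcs = 0, ycs = 1 -> one full-width, half-height UV plane, so a
   * whole interleaved UV row is read at once, as the comment says. */
  printf("NV12 UV: %d x %d\n", chroma_w(640, 0), chroma_h(480, 1));
  return 0;
}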
diff --git a/media/libvpx/libvpx/vpx/src/vpx_tpl.c b/media/libvpx/libvpx/vpx/src/vpx_tpl.c
index 62c2a9c857..b0687a8135 100644
--- a/media/libvpx/libvpx/vpx/src/vpx_tpl.c
+++ b/media/libvpx/libvpx/vpx/src/vpx_tpl.c
@@ -47,8 +47,8 @@ vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file,
"%" PRId64 " %" PRId64 " %" PRId16 " %" PRId16 " %" PRId64
" %" PRId64 " %d\n",
block_stats.inter_cost, block_stats.intra_cost,
- block_stats.mv_c, block_stats.mv_r, block_stats.recrf_dist,
- block_stats.recrf_rate, block_stats.ref_frame_index));
+ block_stats.mv_c, block_stats.mv_r, block_stats.srcrf_dist,
+ block_stats.srcrf_rate, block_stats.ref_frame_index));
}
}
@@ -88,7 +88,7 @@ vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file,
" %" SCNd64 " %d\n",
&block_stats->inter_cost, &block_stats->intra_cost,
&block_stats->mv_c, &block_stats->mv_r,
- &block_stats->recrf_dist, &block_stats->recrf_rate,
+ &block_stats->srcrf_dist, &block_stats->srcrf_rate,
&block_stats->ref_frame_index),
7);
}
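Writer and reader must stay in lockstep: both now serialize srcrf_dist/srcrf_rate in the field positions recrf_dist/recrf_rate used to occupy, with matching PRId64/SCNd64 conversions. A minimal round trip of that pattern (a two-field struct stands in for VpxTplBlockStats):

#include <inttypes.h>
#include <stdio.h>

typedef struct { int64_t srcrf_dist, srcrf_rate; } stats;

int main(void) {
  stats out = { 12345678901LL, 42 }, in = { 0, 0 };
  FILE *f = tmpfile();
  if (!f) return 1;
  fprintf(f, "%" PRId64 " %" PRId64 "\n", out.srcrf_dist, out.srcrf_rate);
  rewind(f);
  /* SCNd64 must mirror PRId64, or the fields shift and parsing silently skews. */
  if (fscanf(f, "%" SCNd64 " %" SCNd64 "\n", &in.srcrf_dist,
             &in.srcrf_rate) != 2)
    return 1;
  fclose(f);
  return !(in.srcrf_dist == out.srcrf_dist && in.srcrf_rate == out.srcrf_rate);
}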
diff --git a/media/libvpx/libvpx/vpx/vp8cx.h b/media/libvpx/libvpx/vpx/vp8cx.h
index b12938d3d8..dfdbb3c770 100644
--- a/media/libvpx/libvpx/vpx/vp8cx.h
+++ b/media/libvpx/libvpx/vpx/vp8cx.h
@@ -772,6 +772,8 @@ enum vp8e_enc_control_id {
/*!\brief Codec control to use external RC to control TPL.
*
* This will use external RC to control the QP and GOP structure for TPL.
+ * (rc_type & VPX_RC_QP) in vpx_rc_funcs_t must be non-zero.
+ * The get_encodeframe_decision callback in vpx_rc_funcs_t must also be set.
*
* Supported in codecs: VP9
*/
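The added note states two preconditions for external-RC-driven TPL: rc_type must include VPX_RC_QP, and get_encodeframe_decision must be set. A sketch of wiring a toy callback up, using the revised signature from the vpx_ext_ratectrl.h hunk further down (the decision values are placeholders, and only the two fields the note names are assumed; the rest of vpx_rc_funcs_t is zeroed):

#include <string.h>
#include "vpx/vpx_ext_ratectrl.h"

/* Toy per-frame decision: a fixed QP. frame_gop_index is the frame's
 * position inside the current GOP. */
static vpx_rc_status_t my_get_encodeframe_decision(
    vpx_rc_model_t model, const int frame_gop_index,
    vpx_rc_encodeframe_decision_t *decision) {
  (void)model;
  (void)frame_gop_index;
  decision->q_index = 100; /* [0..255]; a real model varies this per frame */
  decision->rdmult = 0;    /* placeholder; a real model derives this */
  return VPX_RC_OK;
}

static void setup_funcs(vpx_rc_funcs_t *funcs) {
  memset(funcs, 0, sizeof(*funcs));
  funcs->rc_type = VPX_RC_QP; /* must be non-zero, per the note above */
  funcs->get_encodeframe_decision = my_get_encodeframe_decision;
}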
diff --git a/media/libvpx/libvpx/vpx/vpx_encoder.h b/media/libvpx/libvpx/vpx/vpx_encoder.h
index 18e3862bd7..809a097d94 100644
--- a/media/libvpx/libvpx/vpx/vpx_encoder.h
+++ b/media/libvpx/libvpx/vpx/vpx_encoder.h
@@ -31,7 +31,6 @@ extern "C" {
#include "./vpx_codec.h" // IWYU pragma: export
#include "./vpx_ext_ratectrl.h"
-#include "./vpx_tpl.h"
/*! Temporal Scalability: Maximum length of the sequence defining frame
* layer membership
@@ -57,10 +56,15 @@ extern "C" {
* must be bumped. Examples include, but are not limited to, changing
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
+ *
+ * \note
+ * VPX_ENCODER_ABI_VERSION has a VPX_EXT_RATECTRL_ABI_VERSION component
+ * because the VP9E_SET_EXTERNAL_RATE_CONTROL codec control uses
+ * vpx_rc_funcs_t.
*/
-#define VPX_ENCODER_ABI_VERSION \
- (16 + VPX_CODEC_ABI_VERSION + VPX_EXT_RATECTRL_ABI_VERSION + \
- VPX_TPL_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_ENCODER_ABI_VERSION \
+ (18 + VPX_CODEC_ABI_VERSION + \
+ VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/
/*! \brief Encoder capabilities bitfield
*
@@ -1074,6 +1078,12 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
* The buffer was set successfully.
* \retval #VPX_CODEC_INVALID_PARAM
* A parameter was NULL, the image format is unsupported, etc.
+ *
+ * \note
+ * `duration` and `deadline` are of the unsigned long type, which can be 32
+ * or 64 bits. `duration` and `deadline` must be less than or equal to
+ * UINT32_MAX so that their ranges are independent of the size of unsigned
+ * long.
*/
vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx,
const vpx_fixed_buf_t *buf,
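The ABI note matters because vpx_codec_enc_init() is a convenience macro that forwards VPX_ENCODER_ABI_VERSION to vpx_codec_enc_init_ver(); a library built with a different composite version answers VPX_CODEC_ABI_MISMATCH rather than proceeding with an incompatible struct layout. A minimal caller, as a sketch:

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Initialization pins the caller's ABI expectation: a mismatched library
 * build fails fast with VPX_CODEC_ABI_MISMATCH instead of misbehaving. */
static int init_encoder_checked(vpx_codec_ctx_t *ctx,
                                vpx_codec_enc_cfg_t *cfg) {
  const vpx_codec_err_t err =
      vpx_codec_enc_init(ctx, vpx_codec_vp9_cx(), cfg, 0);
  return err == VPX_CODEC_OK ? 0 : -1;
}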
diff --git a/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h
index 46d290dff4..ba12e4f83b 100644
--- a/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h
+++ b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h
@@ -26,7 +26,7 @@ extern "C" {
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures.
*/
-#define VPX_EXT_RATECTRL_ABI_VERSION (7)
+#define VPX_EXT_RATECTRL_ABI_VERSION (5 + VPX_TPL_ABI_VERSION)
/*!\brief The control type of the inference API.
* In VPX_RC_QP mode, the external rate control model determines the
@@ -81,17 +81,10 @@ typedef void *vpx_rc_model_t;
*
* The encoder will receive the decision from the external rate control model
* through get_encodeframe_decision() defined in vpx_rc_funcs_t.
- *
- * If q_index = VPX_DEFAULT_Q, the encoder will use libvpx's default q.
- *
- * If max_frame_size = 0, the encoding ignores max frame size limit.
- * If max_frame_size = -1, the encoding uses VP9's max frame size as the limit.
- * If the encoded frame size is larger than max_frame_size, the frame is
- * recoded to meet the size limit, following VP9's recoding principles.
*/
typedef struct vpx_rc_encodeframe_decision {
- int q_index; /**< Quantizer step index [0..255]*/
- int max_frame_size; /**< Maximal frame size allowed to encode a frame*/
+ int q_index; /**< Quantizer step index [0..255]*/
+ int rdmult; /**< Frame level Lagrangian multiplier*/
} vpx_rc_encodeframe_decision_t;
/*!\brief Information for the frame to be encoded.
@@ -322,6 +315,7 @@ typedef struct vpx_rc_config {
vpx_ext_rc_mode_t rc_mode; /**< Q mode or VBR mode */
int overshoot_percent; /**< for VBR mode only */
int undershoot_percent; /**< for VBR mode only */
+ int base_qp; /**< base QP for leaf frames, 0-255 */
} vpx_rc_config_t;
/*!\brief Information passed to the external rate control model to
@@ -400,6 +394,7 @@ typedef struct vpx_rc_gop_info {
typedef struct vpx_rc_gop_decision {
int gop_coding_frames; /**< The number of frames of this GOP */
int use_alt_ref; /**< Whether to use alt ref for this GOP */
+ int use_key_frame; /**< Whether to set key frame for this GOP */
} vpx_rc_gop_decision_t;
/*!\brief Create an external rate control model callback prototype
@@ -446,12 +441,11 @@ typedef vpx_rc_status_t (*vpx_rc_send_tpl_gop_stats_cb_fn_t)(
* the external rate control model.
*
* \param[in] rate_ctrl_model rate control model
- * \param[in] encode_frame_info information of the coding frame
+ * \param[in] frame_gop_index index of the frame in current gop
* \param[out] frame_decision encode decision of the coding frame
*/
typedef vpx_rc_status_t (*vpx_rc_get_encodeframe_decision_cb_fn_t)(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
+ vpx_rc_model_t rate_ctrl_model, const int frame_gop_index,
vpx_rc_encodeframe_decision_t *frame_decision);
/*!\brief Update encode frame result callback prototype
@@ -472,12 +466,10 @@ typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)(
* the external rate control model.
*
* \param[in] rate_ctrl_model rate control model
- * \param[in] gop_info information collected from the encoder
* \param[out] gop_decision GOP decision from the model
*/
typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)(
- vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
- vpx_rc_gop_decision_t *gop_decision);
+ vpx_rc_model_t rate_ctrl_model, vpx_rc_gop_decision_t *gop_decision);
/*!\brief Get the frame rdmult from the external rate control model.
*
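Under the new signature the model no longer receives a vpx_rc_gop_info_t, so it must track GOP position itself and simply fill in the decision, including the new use_key_frame field guarded by the ABI bump above. A toy callback, as a sketch:

#include "vpx/vpx_ext_ratectrl.h"

static vpx_rc_status_t my_get_gop_decision(vpx_rc_model_t model,
                                           vpx_rc_gop_decision_t *gop) {
  (void)model; /* a real model would consult its own frame accounting here */
  gop->gop_coding_frames = 16; /* illustrative GOP length */
  gop->use_alt_ref = 1;
  gop->use_key_frame = 0; /* new field introduced in this change */
  return VPX_RC_OK;
}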
diff --git a/media/libvpx/libvpx/vpx/vpx_tpl.h b/media/libvpx/libvpx/vpx/vpx_tpl.h
index a250aada60..7e4c9ab7e1 100644
--- a/media/libvpx/libvpx/vpx/vpx_tpl.h
+++ b/media/libvpx/libvpx/vpx/vpx_tpl.h
@@ -32,19 +32,21 @@ extern "C" {
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
-#define VPX_TPL_ABI_VERSION (2) /**<\hideinitializer*/
+#define VPX_TPL_ABI_VERSION (3) /**<\hideinitializer*/
/*!\brief Temporal dependency model stats for each block before propagation */
typedef struct VpxTplBlockStats {
- int16_t row; /**< Pixel row of the top left corner */
- int16_t col; /**< Pixel col of the top left corner */
- int64_t intra_cost; /**< Intra cost */
- int64_t inter_cost; /**< Inter cost */
- int16_t mv_r; /**< Motion vector row */
- int16_t mv_c; /**< Motion vector col */
- int64_t recrf_rate; /**< Rate from reconstructed ref frame */
- int64_t recrf_dist; /**< Distortion from reconstructed ref frame */
- int ref_frame_index; /**< Ref frame index in the ref frame buffer */
+ int16_t row; /**< Pixel row of the top left corner */
+ int16_t col; /**< Pixel col of the top left corner */
+ int64_t intra_cost; /**< Intra cost */
+ int64_t inter_cost; /**< Inter cost */
+ int16_t mv_r; /**< Motion vector row */
+ int16_t mv_c; /**< Motion vector col */
+ int64_t srcrf_rate; /**< Rate from source ref frame */
+ int64_t srcrf_dist; /**< Distortion from source ref frame */
+ int64_t inter_pred_err; /**< Inter prediction error */
+ int64_t intra_pred_err; /**< Intra prediction error */
+ int ref_frame_index; /**< Ref frame index in the ref frame buffer */
} VpxTplBlockStats;
/*!\brief Temporal dependency model stats for each frame before propagation */
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
index 683df5797a..f8b94620d4 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
@@ -168,40 +168,40 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
\
if (xoffset == 0) { \
if (yoffset == 0) { \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp[w * h]; \
highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
h); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \
src_stride, h, yoffset); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} \
} else if (xoffset == 4) { \
uint16_t tmp0[w * (h + 1)]; \
if (yoffset == 0) { \
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * (h + 1)]; \
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
(h + 1)); \
highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * (h + 1)]; \
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
(h + 1)); \
highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} else { \
@@ -209,21 +209,21 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
if (yoffset == 0) { \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
xoffset); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
(h + 1), xoffset); \
highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
(h + 1), xoffset); \
highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} \
@@ -430,22 +430,22 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
} while (--i != 0);
}
-#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
- uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref, int ref_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t tmp0[w * (h + 1)]; \
- uint16_t tmp1[w * h]; \
- uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
- \
- highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
- xoffset); \
- highbd_avg_pred_var_filter_block2d_bil_w##w( \
- tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
- \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
- CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ \
+ return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+ w, ref, ref_stride, sse); \
}
#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
@@ -460,19 +460,19 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
if (yoffset == 0) { \
highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \
CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
highbd_avg_pred_var_filter_block2d_avg( \
src_ptr, tmp, source_stride, source_stride, w, h, \
CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} else { \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
src_ptr, tmp, source_stride, source_stride, h, yoffset, \
CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} \
} else if (xoffset == 4) { \
@@ -481,7 +481,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
highbd_avg_pred_var_filter_block2d_avg( \
src_ptr, tmp0, source_stride, 1, w, h, \
CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * (h + 1)]; \
@@ -489,7 +489,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
(h + 1)); \
highbd_avg_pred_var_filter_block2d_avg( \
tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * (h + 1)]; \
@@ -497,7 +497,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
(h + 1)); \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} else { \
@@ -506,7 +506,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
highbd_avg_pred_var_filter_block2d_bil_w##w( \
src_ptr, tmp0, source_stride, 1, h, xoffset, \
CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * h]; \
@@ -514,7 +514,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
(h + 1), xoffset); \
highbd_avg_pred_var_filter_block2d_avg( \
tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * h]; \
@@ -522,7 +522,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
(h + 1), xoffset); \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} \
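The substitution above is the point of this file's change: the macros now tail-call the un-suffixed, RTCD-dispatched vpx_highbd_*_variance names instead of hard-wiring the _neon variants, so the sub-pixel paths automatically pick up faster specializations such as the SVE kernels added below. A toy demonstration of the token pasting involved (function names hypothetical):

#include <stdio.h>

/* bd=10, w=16, h=16 pastes into my_variance_10_16x16(); in libvpx the
 * analogous un-suffixed symbol is resolved by RTCD to the best available
 * ISA-specific implementation at runtime. */
#define CALL_VARIANCE(bd, w, h) my_variance_##bd##_##w##x##h()

static int my_variance_10_16x16(void) { return 42; } /* hypothetical */

int main(void) {
  printf("%d\n", CALL_VARIANCE(10, 16, 16));
  return 0;
}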
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c
new file mode 100644
index 0000000000..cebe06b099
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+#include "vpx_ports/mem.h"
+
+static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int w, int h) {
+ uint64x2_t sse = vdupq_n_u64(0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+ uint16x8_t diff = vabdq_u16(s, r);
+
+ sse = vpx_dotq_u16(sse, diff, diff);
+
+ j += 8;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ return (uint32_t)horizontal_add_uint64x2(sse);
+}
+
+#define HIGHBD_MSE_WXH_SVE(w, h) \
+ uint32_t vpx_highbd_10_mse##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ uint32_t sse_tmp = \
+ highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h); \
+ sse_tmp = ROUND_POWER_OF_TWO(sse_tmp, 4); \
+ *sse = sse_tmp; \
+ return sse_tmp; \
+ } \
+ \
+ uint32_t vpx_highbd_12_mse##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ uint32_t sse_tmp = \
+ highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h); \
+ sse_tmp = ROUND_POWER_OF_TWO(sse_tmp, 8); \
+ *sse = sse_tmp; \
+ return sse_tmp; \
+ }
+
+HIGHBD_MSE_WXH_SVE(16, 16)
+HIGHBD_MSE_WXH_SVE(16, 8)
+HIGHBD_MSE_WXH_SVE(8, 16)
+HIGHBD_MSE_WXH_SVE(8, 8)
+
+#undef HIGHBD_MSE_WXH_SVE
+
+// Process a block of width 4 two rows at a time.
+static INLINE void highbd_variance_4xh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h, uint64_t *sse,
+ int64_t *sum) {
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ do {
+ const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride);
+ const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride);
+
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+ sum_s16 = vaddq_s16(sum_s16, diff);
+ sse_s64 = vpx_dotq_s16(sse_s64, diff, diff);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ h -= 2;
+ } while (h != 0);
+
+ *sum = horizontal_add_int16x8(sum_s16);
+ *sse = horizontal_add_int64x2(sse_s64);
+}
+
+static INLINE void highbd_variance_8xh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h, uint64_t *sse,
+ int64_t *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ do {
+ const uint16x8_t s = vld1q_u16(src_ptr);
+ const uint16x8_t r = vld1q_u16(ref_ptr);
+
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+ sum_s32 = vpadalq_s16(sum_s32, diff);
+ sse_s64 = vpx_dotq_s16(sse_s64, diff, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ *sum = horizontal_add_int32x4(sum_s32);
+ *sse = horizontal_add_int64x2(sse_s64);
+}
+
+static INLINE void highbd_variance_16xh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ const uint16x8_t s0 = vld1q_u16(src_ptr);
+ const uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+
+ const uint16x8_t r0 = vld1q_u16(ref_ptr);
+ const uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+
+ const int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+ const int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(s1, r1));
+
+ sum_s32[0] = vpadalq_s16(sum_s32[0], diff0);
+ sum_s32[1] = vpadalq_s16(sum_s32[1], diff1);
+
+ sse_s64[0] = vpx_dotq_s16(sse_s64[0], diff0, diff0);
+ sse_s64[1] = vpx_dotq_s16(sse_s64[1], diff1, diff1);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]);
+ sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]);
+
+ *sum = horizontal_add_int32x4(sum_s32[0]);
+ *sse = horizontal_add_int64x2(sse_s64[0]);
+}
+
+static INLINE void highbd_variance_wxh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int w, int h,
+ uint64_t *sse, int64_t *sum) {
+ int32x4_t sum_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+ int64x2_t sse_s64[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+ vdupq_n_s64(0) };
+
+ do {
+ int i = 0;
+ do {
+ const uint16x8_t s0 = vld1q_u16(src_ptr + i);
+ const uint16x8_t s1 = vld1q_u16(src_ptr + i + 8);
+ const uint16x8_t s2 = vld1q_u16(src_ptr + i + 16);
+ const uint16x8_t s3 = vld1q_u16(src_ptr + i + 24);
+
+ const uint16x8_t r0 = vld1q_u16(ref_ptr + i);
+ const uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8);
+ const uint16x8_t r2 = vld1q_u16(ref_ptr + i + 16);
+ const uint16x8_t r3 = vld1q_u16(ref_ptr + i + 24);
+
+ const int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+ const int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(s1, r1));
+ const int16x8_t diff2 = vreinterpretq_s16_u16(vsubq_u16(s2, r2));
+ const int16x8_t diff3 = vreinterpretq_s16_u16(vsubq_u16(s3, r3));
+
+ sum_s32[0] = vpadalq_s16(sum_s32[0], diff0);
+ sum_s32[1] = vpadalq_s16(sum_s32[1], diff1);
+ sum_s32[2] = vpadalq_s16(sum_s32[2], diff2);
+ sum_s32[3] = vpadalq_s16(sum_s32[3], diff3);
+
+ sse_s64[0] = vpx_dotq_s16(sse_s64[0], diff0, diff0);
+ sse_s64[1] = vpx_dotq_s16(sse_s64[1], diff1, diff1);
+ sse_s64[2] = vpx_dotq_s16(sse_s64[2], diff2, diff2);
+ sse_s64[3] = vpx_dotq_s16(sse_s64[3], diff3, diff3);
+
+ i += 32;
+ } while (i < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]);
+ sum_s32[2] = vaddq_s32(sum_s32[2], sum_s32[3]);
+ sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[2]);
+
+ sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]);
+ sse_s64[2] = vaddq_s64(sse_s64[2], sse_s64[3]);
+ sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[2]);
+
+ *sum = horizontal_add_int32x4(sum_s32[0]);
+ *sse = horizontal_add_int64x2(sse_s64[0]);
+}
+
+static INLINE void highbd_variance_32xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse,
+ int64_t *sum) {
+ highbd_variance_wxh_sve(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
+
+static INLINE void highbd_variance_64xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse,
+ int64_t *sum) {
+ highbd_variance_wxh_sve(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
+
+#define HBD_VARIANCE_WXH_SVE(w, h) \
+ uint32_t vpx_highbd_8_variance##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)sse_long; \
+ sum = (int)sum_long; \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \
+ } \
+ \
+ uint32_t vpx_highbd_10_variance##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t vpx_highbd_12_variance##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+HBD_VARIANCE_WXH_SVE(4, 4)
+HBD_VARIANCE_WXH_SVE(4, 8)
+
+HBD_VARIANCE_WXH_SVE(8, 4)
+HBD_VARIANCE_WXH_SVE(8, 8)
+HBD_VARIANCE_WXH_SVE(8, 16)
+
+HBD_VARIANCE_WXH_SVE(16, 8)
+HBD_VARIANCE_WXH_SVE(16, 16)
+HBD_VARIANCE_WXH_SVE(16, 32)
+
+HBD_VARIANCE_WXH_SVE(32, 16)
+HBD_VARIANCE_WXH_SVE(32, 32)
+HBD_VARIANCE_WXH_SVE(32, 64)
+
+HBD_VARIANCE_WXH_SVE(64, 32)
+HBD_VARIANCE_WXH_SVE(64, 64)
+
+#define HIGHBD_GET_VAR_SVE(s) \
+ void vpx_highbd_8_get##s##x##s##var_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)sse_long; \
+ *sum = (int)sum_long; \
+ } \
+ \
+ void vpx_highbd_10_get##s##x##s##var_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \
+ } \
+ \
+ void vpx_highbd_12_get##s##x##s##var_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \
+ }
+
+HIGHBD_GET_VAR_SVE(8)
+HIGHBD_GET_VAR_SVE(16)
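Each generated wrapper in the new file reduces its vector accumulators to a scalar sse and sum, rounds them back toward 8-bit precision for the 10/12-bit variants, and applies the usual variance identity, clamping a (post-rounding) negative result to zero. The scalar identity, as a sketch:

#include <stdint.h>

/* Variance of a w x h block from its sum of squared differences (sse) and
 * sum of differences (sum): var = E[d^2] - E[d]^2, scaled by w * h. */
static uint32_t block_variance(uint64_t sse, int64_t sum, int w, int h) {
  const int64_t var = (int64_t)sse - (sum * sum) / (w * h);
  return var >= 0 ? (uint32_t)var : 0;
}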
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
index 47684473ca..b5a944d299 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
@@ -14,86 +14,51 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
-static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p,
- int16x4_t *const s0, int16x4_t *const s1,
- int16x4_t *const s2, int16x4_t *const s3) {
- *s0 = vld1_s16(s);
- s += p;
- *s1 = vld1_s16(s);
- s += p;
- *s2 = vld1_s16(s);
- s += p;
- *s3 = vld1_s16(s);
-}
-
-static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p,
- uint16x8_t *const s0, uint16x8_t *const s1,
- uint16x8_t *const s2, uint16x8_t *const s3) {
- *s0 = vld1q_u16(s);
- s += p;
- *s1 = vld1q_u16(s);
- s += p;
- *s2 = vld1q_u16(s);
- s += p;
- *s3 = vld1q_u16(s);
-}
-
-static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p,
- int16x8_t *const s0, int16x8_t *const s1,
- int16x8_t *const s2, int16x8_t *const s3,
- int16x8_t *const s4, int16x8_t *const s5,
- int16x8_t *const s6, int16x8_t *const s7) {
- *s0 = vld1q_s16(s);
- s += p;
- *s1 = vld1q_s16(s);
- s += p;
- *s2 = vld1q_s16(s);
- s += p;
- *s3 = vld1q_s16(s);
- s += p;
- *s4 = vld1q_s16(s);
- s += p;
- *s5 = vld1q_s16(s);
- s += p;
- *s6 = vld1q_s16(s);
- s += p;
- *s7 = vld1q_s16(s);
+static INLINE uint16x4_t highbd_convolve4_4(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t filters, const uint16x4_t max) {
+ int32x4_t sum = vmull_lane_s16(s0, filters, 0);
+ sum = vmlal_lane_s16(sum, s1, filters, 1);
+ sum = vmlal_lane_s16(sum, s2, filters, 2);
+ sum = vmlal_lane_s16(sum, s3, filters, 3);
+
+ uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
+ return vmin_u16(res, max);
}
-static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p,
- const uint16x8_t s0, const uint16x8_t s1,
- const uint16x8_t s2, const uint16x8_t s3,
- const uint16x8_t s4, const uint16x8_t s5,
- const uint16x8_t s6, const uint16x8_t s7) {
- vst1q_u16(s, s0);
- s += p;
- vst1q_u16(s, s1);
- s += p;
- vst1q_u16(s, s2);
- s += p;
- vst1q_u16(s, s3);
- s += p;
- vst1q_u16(s, s4);
- s += p;
- vst1q_u16(s, s5);
- s += p;
- vst1q_u16(s, s6);
- s += p;
- vst1q_u16(s, s7);
+static INLINE uint16x8_t highbd_convolve4_8(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x4_t filters, const uint16x8_t max) {
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters, 3);
+
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters, 3);
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+ return vminq_u16(res, max);
}
-static INLINE int32x4_t highbd_convolve8_4(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) {
+static INLINE uint16x4_t
+highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t filters, const uint16x4_t max) {
const int16x4_t filters_lo = vget_low_s16(filters);
const int16x4_t filters_hi = vget_high_s16(filters);
- int32x4_t sum;
- sum = vmull_lane_s16(s0, filters_lo, 0);
+ int32x4_t sum = vmull_lane_s16(s0, filters_lo, 0);
sum = vmlal_lane_s16(sum, s1, filters_lo, 1);
sum = vmlal_lane_s16(sum, s2, filters_lo, 2);
sum = vmlal_lane_s16(sum, s3, filters_lo, 3);
@@ -101,7 +66,9 @@ static INLINE int32x4_t highbd_convolve8_4(
sum = vmlal_lane_s16(sum, s5, filters_hi, 1);
sum = vmlal_lane_s16(sum, s6, filters_hi, 2);
sum = vmlal_lane_s16(sum, s7, filters_hi, 3);
- return sum;
+
+ uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
+ return vmin_u16(res, max);
}
static INLINE uint16x8_t
@@ -111,10 +78,8 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t filters, const uint16x8_t max) {
const int16x4_t filters_lo = vget_low_s16(filters);
const int16x4_t filters_hi = vget_high_s16(filters);
- int32x4_t sum0, sum1;
- uint16x8_t d;
- sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3);
@@ -122,7 +87,8 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3);
- sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
+
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3);
@@ -130,9 +96,152 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3);
- d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7));
- d = vminq_u16(d, max);
- return d;
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+ return vminq_u16(res, max);
+}
+
+static INLINE void highbd_convolve_4tap_horiz_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x4_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 =
+ highbd_convolve4_4(s0[0], s0[1], s0[2], s0[3], filter, max);
+ uint16x4_t d1 =
+ highbd_convolve4_4(s1[0], s1[1], s1[2], s1[3], filter, max);
+ uint16x4_t d2 =
+ highbd_convolve4_4(s2[0], s2[1], s2[2], s2[3], filter, max);
+ uint16x4_t d3 =
+ highbd_convolve4_4(s3[0], s3[1], s3[2], s3[3], filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x8_t d0 =
+ highbd_convolve4_8(s0[0], s0[1], s0[2], s0[3], filter, max);
+ uint16x8_t d1 =
+ highbd_convolve4_8(s1[0], s1[1], s1[2], s1[3], filter, max);
+ uint16x8_t d2 =
+ highbd_convolve4_8(s2[0], s2[1], s2[2], s2[3], filter, max);
+ uint16x8_t d3 =
+ highbd_convolve4_8(s3[0], s3[1], s3[2], s3[3], filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
+
+static INLINE void highbd_convolve_8tap_horiz_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x4_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], s0[6], s0[7], filter, max);
+ uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], s1[6], s1[7], filter, max);
+ uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], s2[6], s2[7], filter, max);
+ uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], s3[6], s3[7], filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], s0[6], s0[7], filter, max);
+ uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], s1[6], s1[7], filter, max);
+ uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], s2[6], s2[7], filter, max);
+ uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], s3[6], s3[7], filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
}
void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
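The refactor also replaces the literal narrowing shift of 7 with FILTER_BITS and folds rounding, narrowing, and the bit-depth clamp into the convolve helpers themselves. A scalar model of highbd_convolve4_4's arithmetic (non-NEON, for illustration):

#include <stdint.h>

#define FILTER_BITS 7 /* libvpx filter coefficients are scaled by 128 */

/* Scalar model of the 4-tap highbd convolve: dot product, round, shift,
 * then clamp to [0, (1 << bd) - 1]. The NEON version computes 4 or 8 lanes
 * at once with vmull/vmlal_lane_s16 and vqrshrun_n_s32 + vmin. */
static uint16_t convolve4_scalar(const int16_t s[4], const int16_t f[4],
                                 int bd) {
  const int32_t sum = s[0] * f[0] + s[1] * f[1] + s[2] * f[2] + s[3] * f[3];
  int32_t res = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
  const int32_t max = (1 << bd) - 1;
  if (res < 0) res = 0;
  if (res > max) res = max;
  return (uint16_t)res;
}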
@@ -143,202 +252,25 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
if (x_step_q4 != 16) {
vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
- } else {
- const int16x8_t filters = vld1q_s16(filter[x0_q4]);
- const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
- uint16x8_t t0, t1, t2, t3;
-
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
-
- src -= 3;
-
- if (h == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int32x4_t d0, d1, d2, d3;
- uint16x8_t d01, d23;
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u16_8x4(&t0, &t1, &t2, &t3);
- s0 = vreinterpret_s16_u16(vget_low_u16(t0));
- s1 = vreinterpret_s16_u16(vget_low_u16(t1));
- s2 = vreinterpret_s16_u16(vget_low_u16(t2));
- s3 = vreinterpret_s16_u16(vget_low_u16(t3));
- s4 = vreinterpret_s16_u16(vget_high_u16(t0));
- s5 = vreinterpret_s16_u16(vget_high_u16(t1));
- s6 = vreinterpret_s16_u16(vget_high_u16(t2));
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- src += 7;
-
- do {
- load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
- transpose_s16_4x4d(&s7, &s8, &s9, &s10);
-
- d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ return;
+ }
- d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
- d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
- d01 = vminq_u16(d01, max);
- d23 = vminq_u16(d23, max);
- transpose_u16_4x4q(&d01, &d23);
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ assert(x_step_q4 == 16);
- vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
- vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
- vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- src += 4;
- dst += 4;
- w -= 4;
- } while (w > 0);
- } else {
- int16x8_t t4, t5, t6, t7;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x8_t d0, d1, d2, d3;
-
- if (w == 4) {
- do {
- load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
- &s5, &s6, &s7);
- transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
- load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
- &t4, &t5, &t6, &t7);
- src += 8 * src_stride;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
- transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- d3 =
- highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-
- transpose_u16_8x4(&d0, &d1, &d2, &d3);
- vst1_u16(dst, vget_low_u16(d0));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d1));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d2));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d3));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d0));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d1));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d2));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d3));
- dst += dst_stride;
- h -= 8;
- } while (h > 0);
- } else {
- int width;
- const uint16_t *s;
- uint16_t *d;
- int16x8_t s11, s12, s13, s14;
- uint16x8_t d4, d5, d6, d7;
-
- do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
- &s5, &s6, &s7);
- transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
- width = w;
- s = src + 7;
- d = dst;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
-
- do {
- load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
- &s12, &s13, &s14);
- transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
-
- d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
- max);
- d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
- max);
- d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
- max);
- d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
- max);
- d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
- max);
- d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
- max);
- d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
- max);
- d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
- filters, max);
-
- transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
- store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src += 8 * src_stride;
- dst += 8 * dst_stride;
- h -= 8;
- } while (h > 0);
- }
- }
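+  // vpx_get_filter_taps() reports the effective tap count of the selected
+  // kernel. 4-tap kernels keep their nonzero coefficients in the middle of
+  // the 8-element array (indices 2..5), hence the +2 offset on the filter
+  // load and the smaller 1-sample pullback on src.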
+ if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+ const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2);
+ highbd_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
+ x_filter_4tap, bd);
+ } else {
+ const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
+ highbd_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
+ x_filter_8tap, bd);
}
}
@@ -352,66 +284,233 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
bd);
+ return;
+ }
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x4_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], s0[6], s0[7], filters, max);
+ uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], s1[6], s1[7], filters, max);
+ uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], s2[6], s2[7], filters, max);
+ uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], s3[6], s3[7], filters, max);
+
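+      // vrhadd_u16() is a rounding halving add: (a + b + 1) >> 1. Averaging
+      // the filtered result with the existing destination rows implements the
+      // compound (averaging) prediction.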
+ d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+ d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+ d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+ d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], s0[6], s0[7], filters, max);
+ uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], s1[6], s1[7], filters, max);
+ uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], s2[6], s2[7], filters, max);
+ uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], s3[6], s3[7], filters, max);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
+
+static INLINE void highbd_convolve_4tap_vert_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t s0, s1, s2;
+ load_s16_4x3(s, src_stride, &s0, &s1, &s2);
+
+ s += 3 * src_stride;
+
+ do {
+ int16x4_t s3, s4, s5, s6;
+ load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
+
+ uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, filter, max);
+ uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, filter, max);
+ uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, filter, max);
+ uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
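+      // Slide the 4-tap window: rows s4..s6 become the context for the next
+      // four output rows.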
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
} else {
- const int16x8_t filters = vld1q_s16(filter[x0_q4]);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
-
- src -= 3;
-
- if (h == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int32x4_t d0, d1, d2, d3;
- uint16x8_t t0, t1, t2, t3;
- uint16x8_t d01, d23, t01, t23;
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u16_8x4(&t0, &t1, &t2, &t3);
- s0 = vreinterpret_s16_u16(vget_low_u16(t0));
- s1 = vreinterpret_s16_u16(vget_low_u16(t1));
- s2 = vreinterpret_s16_u16(vget_low_u16(t2));
- s3 = vreinterpret_s16_u16(vget_low_u16(t3));
- s4 = vreinterpret_s16_u16(vget_high_u16(t0));
- s5 = vreinterpret_s16_u16(vget_high_u16(t1));
- s6 = vreinterpret_s16_u16(vget_high_u16(t2));
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- src += 7;
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t s0, s1, s2;
+ load_s16_8x3(s, src_stride, &s0, &s1, &s2);
+
+ s += 3 * src_stride;
do {
- load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
- transpose_s16_4x4d(&s7, &s8, &s9, &s10);
-
- d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-
- t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
- t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
- t01 = vminq_u16(t01, max);
- t23 = vminq_u16(t23, max);
- transpose_u16_4x4q(&t01, &t23);
-
- d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
- vld1_u16(dst + 2 * dst_stride));
- d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
- vld1_u16(dst + 3 * dst_stride));
- d01 = vrhaddq_u16(d01, t01);
- d23 = vrhaddq_u16(d23, t23);
-
- vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
- vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
- vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
+ int16x8_t s3, s4, s5, s6;
+ load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
+
+ uint16x8_t d0 = highbd_convolve4_8(s0, s1, s2, s3, filter, max);
+ uint16x8_t d1 = highbd_convolve4_8(s1, s2, s3, s4, filter, max);
+ uint16x8_t d2 = highbd_convolve4_8(s2, s3, s4, s5, filter, max);
+ uint16x8_t d3 = highbd_convolve4_8(s3, s4, s5, s6, filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE void highbd_convolve_8tap_vert_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
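+    // An 8-tap vertical filter needs 7 rows of context. Load them up front,
+    // then advance the window by 4 rows per iteration of the loop below.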
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 =
+ highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter, max);
+ uint16x4_t d1 =
+ highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter, max);
+ uint16x4_t d2 =
+ highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter, max);
+ uint16x4_t d3 =
+ highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 =
+ highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter, max);
+ uint16x8_t d1 =
+ highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter, max);
+ uint16x8_t d2 =
+ highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter, max);
+ uint16x8_t d3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -420,164 +519,14 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
s4 = s8;
s5 = s9;
s6 = s10;
- src += 4;
- dst += 4;
- w -= 4;
- } while (w > 0);
- } else {
- int16x8_t t4, t5, t6, t7;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
-
- if (w == 4) {
- do {
- load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
- &s5, &s6, &s7);
- transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
- load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
- &t4, &t5, &t6, &t7);
- src += 8 * src_stride;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
- transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- t3 =
- highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
- transpose_u16_8x4(&t0, &t1, &t2, &t3);
-
- d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
- vld1_u16(dst + 4 * dst_stride));
- d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
- vld1_u16(dst + 5 * dst_stride));
- d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
- vld1_u16(dst + 6 * dst_stride));
- d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride),
- vld1_u16(dst + 7 * dst_stride));
- d0 = vrhaddq_u16(d0, t0);
- d1 = vrhaddq_u16(d1, t1);
- d2 = vrhaddq_u16(d2, t2);
- d3 = vrhaddq_u16(d3, t3);
-
- vst1_u16(dst, vget_low_u16(d0));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d1));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d2));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d3));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d0));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d1));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d2));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d3));
- dst += dst_stride;
- h -= 8;
- } while (h > 0);
- } else {
- int width;
- const uint16_t *s;
- uint16_t *d;
- int16x8_t s11, s12, s13, s14;
- uint16x8_t d4, d5, d6, d7;
-
- do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
- &s5, &s6, &s7);
- transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
- width = w;
- s = src + 7;
- d = dst;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
-
- do {
- load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
- &s12, &s13, &s14);
- transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
-
- d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
- max);
- d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
- max);
- d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
- max);
- d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
- max);
- d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
- max);
- d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
- max);
- d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
- max);
- d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
- filters, max);
-
- transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-
- d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
- d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
- d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
- d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
- d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride));
- d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride));
- d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride));
- d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride));
-
- store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src += 8 * src_stride;
- dst += 8 * dst_stride;
- h -= 8;
- } while (h > 0);
- }
- }
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
}
}
@@ -589,160 +538,25 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
if (y_step_q4 != 16) {
vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
x_step_q4, y0_q4, y_step_q4, w, h, bd);
- } else {
- const int16x8_t filters = vld1q_s16(filter[y0_q4]);
- const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
-
- src -= 3 * src_stride;
-
- if (w == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int32x4_t d0, d1, d2, d3;
- uint16x8_t d01, d23;
-
- s0 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s1 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s2 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s3 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s4 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s5 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s6 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
+ return;
+ }
- do {
- s7 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s8 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s9 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s10 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-
- d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
- d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
- d01 = vminq_u16(d01, max);
- d23 = vminq_u16(d23, max);
- vst1_u16(dst, vget_low_u16(d01));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d01));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d23));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d23));
- dst += dst_stride;
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ assert(y_step_q4 == 16);
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- h -= 4;
- } while (h > 0);
- } else {
- int height;
- const uint16_t *s;
- uint16_t *d;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x8_t d0, d1, d2, d3;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
- do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- s = src;
- s0 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s1 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s2 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s3 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s4 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s5 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s6 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- d = dst;
- height = h;
-
- do {
- s7 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s8 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s9 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s10 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
- d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- d3 =
- highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-
- vst1q_u16(d, d0);
- d += dst_stride;
- vst1q_u16(d, d1);
- d += dst_stride;
- vst1q_u16(d, d2);
- d += dst_stride;
- vst1q_u16(d, d3);
- d += dst_stride;
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- height -= 4;
- } while (height > 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w > 0);
- }
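+  // A 4-tap vertical filter needs only one row of context above the block,
+  // hence src - src_stride; the 8-tap path needs three rows.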
+ if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+ const int16x4_t y_filter_4tap = vld1_s16(filter[y0_q4] + 2);
+ highbd_convolve_4tap_vert_neon(src - src_stride, src_stride, dst,
+ dst_stride, w, h, y_filter_4tap, bd);
+ } else {
+ const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
+ highbd_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst,
+ dst_stride, w, h, y_filter_8tap, bd);
}
}
@@ -756,78 +570,89 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
bd);
+ return;
+ }
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 =
+ highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ uint16x4_t d1 =
+ highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ uint16x4_t d2 =
+ highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ uint16x4_t d3 =
+ highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+ d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+ d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+ d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+ d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
} else {
- const int16x8_t filters = vld1q_s16(filter[y0_q4]);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
-
- src -= 3 * src_stride;
-
- if (w == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int32x4_t d0, d1, d2, d3;
- uint16x8_t d01, d23, t01, t23;
-
- s0 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s1 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s2 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s3 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s4 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s5 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s6 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+ s += 7 * src_stride;
do {
- s7 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s8 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s9 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s10 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-
- t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
- t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
- t01 = vminq_u16(t01, max);
- t23 = vminq_u16(t23, max);
-
- d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
- vld1_u16(dst + 1 * dst_stride));
- d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
- vld1_u16(dst + 3 * dst_stride));
- d01 = vrhaddq_u16(d01, t01);
- d23 = vrhaddq_u16(d23, t23);
-
- vst1_u16(dst, vget_low_u16(d01));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d01));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d23));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d23));
- dst += dst_stride;
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 =
+ highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ uint16x8_t d1 =
+ highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ uint16x8_t d2 =
+ highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ uint16x8_t d3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -836,96 +661,592 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
s4 = s8;
s5 = s9;
s6 = s10;
- h -= 4;
- } while (h > 0);
- } else {
- int height;
- const uint16_t *s;
- uint16_t *d;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
- do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- s = src;
- s0 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s1 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s2 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s3 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s4 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s5 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s6 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- d = dst;
- height = h;
-
- do {
- s7 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s8 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s9 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s10 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
- t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- t3 =
- highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-
- d0 = vld1q_u16(d + 0 * dst_stride);
- d1 = vld1q_u16(d + 1 * dst_stride);
- d2 = vld1q_u16(d + 2 * dst_stride);
- d3 = vld1q_u16(d + 3 * dst_stride);
- d0 = vrhaddq_u16(d0, t0);
- d1 = vrhaddq_u16(d1, t1);
- d2 = vrhaddq_u16(d2, t2);
- d3 = vrhaddq_u16(d3, t3);
-
- vst1q_u16(d, d0);
- d += dst_stride;
- vst1q_u16(d, d1);
- d += dst_stride;
- vst1q_u16(d, d2);
- d += dst_stride;
- vst1q_u16(d, d3);
- d += dst_stride;
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- height -= 4;
- } while (height > 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w > 0);
- }
+static INLINE void highbd_convolve_2d_4tap_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
+ const int16x4_t y_filter, int bd) {
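+  // 2-D path: filter each row horizontally, reinterpret the clamped result
+  // as signed, and feed it straight into the vertical filter while it is
+  // still in registers, avoiding an intermediate buffer.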
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t h_s0[4], h_s1[4], h_s2[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]);
+
+ int16x4_t v_s0 = vreinterpret_s16_u16(
+ highbd_convolve4_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max));
+ int16x4_t v_s1 = vreinterpret_s16_u16(
+ highbd_convolve4_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max));
+ int16x4_t v_s2 = vreinterpret_s16_u16(
+ highbd_convolve4_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max));
+
+ s += 3 * src_stride;
+
+ do {
+ int16x4_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
+ &h_s3[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
+ &h_s4[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
+ &h_s5[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
+ &h_s6[3]);
+
+ int16x4_t v_s3 = vreinterpret_s16_u16(highbd_convolve4_4(
+ h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max));
+ int16x4_t v_s4 = vreinterpret_s16_u16(highbd_convolve4_4(
+ h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max));
+ int16x4_t v_s5 = vreinterpret_s16_u16(highbd_convolve4_4(
+ h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max));
+ int16x4_t v_s6 = vreinterpret_s16_u16(highbd_convolve4_4(
+ h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max));
+
+ uint16x4_t d0 = highbd_convolve4_4(v_s0, v_s1, v_s2, v_s3, y_filter, max);
+ uint16x4_t d1 = highbd_convolve4_4(v_s1, v_s2, v_s3, v_s4, y_filter, max);
+ uint16x4_t d2 = highbd_convolve4_4(v_s2, v_s3, v_s4, v_s5, y_filter, max);
+ uint16x4_t d3 = highbd_convolve4_4(v_s3, v_s4, v_s5, v_s6, y_filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+
+ return;
+ }
+
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t h_s0[4], h_s1[4], h_s2[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]);
+
+ int16x8_t v_s0 = vreinterpretq_s16_u16(
+ highbd_convolve4_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max));
+ int16x8_t v_s1 = vreinterpretq_s16_u16(
+ highbd_convolve4_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max));
+ int16x8_t v_s2 = vreinterpretq_s16_u16(
+ highbd_convolve4_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max));
+
+ s += 3 * src_stride;
+
+ do {
+ int16x8_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
+ &h_s3[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
+ &h_s4[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
+ &h_s5[3]);
+ load_s16_8x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
+ &h_s6[3]);
+
+ int16x8_t v_s3 = vreinterpretq_s16_u16(highbd_convolve4_8(
+ h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max));
+ int16x8_t v_s4 = vreinterpretq_s16_u16(highbd_convolve4_8(
+ h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max));
+ int16x8_t v_s5 = vreinterpretq_s16_u16(highbd_convolve4_8(
+ h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max));
+ int16x8_t v_s6 = vreinterpretq_s16_u16(highbd_convolve4_8(
+ h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max));
+
+ uint16x8_t d0 = highbd_convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter, max);
+ uint16x8_t d1 = highbd_convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter, max);
+ uint16x8_t d2 = highbd_convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter, max);
+ uint16x8_t d3 = highbd_convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+}
+
+static INLINE void highbd_convolve_2d_8tap_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x8_t x_filter,
+ const int16x8_t y_filter, int bd) {
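+  // Same register-resident two-pass scheme as the 4-tap version above, but
+  // with a 7-row vertical context window.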
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+ &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+ &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+ &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+ &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+ load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+ &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+ load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+ &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+ load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+ &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+ int16x4_t v_s0 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+ h_s0[6], h_s0[7], x_filter, max));
+ int16x4_t v_s1 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+ h_s1[6], h_s1[7], x_filter, max));
+ int16x4_t v_s2 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+ h_s2[6], h_s2[7], x_filter, max));
+ int16x4_t v_s3 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+ h_s3[6], h_s3[7], x_filter, max));
+ int16x4_t v_s4 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+ h_s4[6], h_s4[7], x_filter, max));
+ int16x4_t v_s5 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+ h_s5[6], h_s5[7], x_filter, max));
+ int16x4_t v_s6 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+ h_s6[6], h_s6[7], x_filter, max));
+
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+ &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+ &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+ &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+ &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+ int16x4_t v_s7 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+ h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+ int16x4_t v_s8 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+ h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+ int16x4_t v_s9 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+ h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+ int16x4_t v_s10 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+ h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+ uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+ v_s6, v_s7, y_filter, max);
+ uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+ v_s7, v_s8, y_filter, max);
+ uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+ v_s8, v_s9, y_filter, max);
+ uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+ v_s9, v_s10, y_filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ v_s3 = v_s7;
+ v_s4 = v_s8;
+ v_s5 = v_s9;
+ v_s6 = v_s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+
+ return;
+ }
+
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+ &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+ &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+ &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+ &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+ load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+ &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+ load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+ &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+ load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+ &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+ int16x8_t v_s0 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+ h_s0[6], h_s0[7], x_filter, max));
+ int16x8_t v_s1 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+ h_s1[6], h_s1[7], x_filter, max));
+ int16x8_t v_s2 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+ h_s2[6], h_s2[7], x_filter, max));
+ int16x8_t v_s3 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+ h_s3[6], h_s3[7], x_filter, max));
+ int16x8_t v_s4 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+ h_s4[6], h_s4[7], x_filter, max));
+ int16x8_t v_s5 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+ h_s5[6], h_s5[7], x_filter, max));
+ int16x8_t v_s6 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+ h_s6[6], h_s6[7], x_filter, max));
+
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+ &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+ &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+ &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+ &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+ int16x8_t v_s7 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+ h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+ int16x8_t v_s8 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+ h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+ int16x8_t v_s9 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+ h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+ int16x8_t v_s10 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+ h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+ uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+ v_s6, v_s7, y_filter, max);
+ uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+ v_s7, v_s8, y_filter, max);
+ uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+ v_s8, v_s9, y_filter, max);
+ uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+ v_s9, v_s10, y_filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ v_s3 = v_s7;
+ v_s4 = v_s8;
+ v_s5 = v_s9;
+ v_s6 = v_s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+}
+
+void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h, int bd) {
+ if (x_step_q4 != 16 || y_step_q4 != 16) {
+ vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h, bd);
+ return;
+ }
+
+ const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
+ const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+  // Account for needing filter_taps / 2 - 1 lines of context before and
+  // filter_taps / 2 lines after the block, both horizontally and vertically.
+ const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
+ const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;
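+  // For an 8-tap filter that is 3 rows/columns before and 4 after each
+  // output position; for a 4-tap filter, 1 before and 2 after.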
+
+ if (x_filter_taps == 4 && y_filter_taps == 4) {
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+ const int16x4_t y_filter = vld1_s16(filter[y0_q4] + 2);
+
+ highbd_convolve_2d_4tap_neon(src - horiz_offset - vert_offset, src_stride,
+ dst, dst_stride, w, h, x_filter, y_filter, bd);
+ return;
+ }
+
+ const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ highbd_convolve_2d_8tap_neon(src - horiz_offset - vert_offset, src_stride,
+ dst, dst_stride, w, h, x_filter, y_filter, bd);
+}
+
+void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (x_step_q4 != 16 || y_step_q4 != 16) {
+ vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h, bd);
+ return;
}
+
+ // Averaging convolution always uses an 8-tap filter.
+ const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1;
+ const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride;
+  // Account for needing SUBPEL_TAPS / 2 - 1 lines of context before and
+  // SUBPEL_TAPS / 2 lines after the block, both horizontally and vertically.
+ src = src - horiz_offset - vert_offset;
+
+ const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
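+  // Horizontal and vertical passes are fused: each iteration filters four
+  // new rows horizontally and feeds them into a rolling 7-row window for the
+  // vertical pass, so no intermediate buffer is needed.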
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+ &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+ &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+ &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+ &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+ load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+ &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+ load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+ &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+ load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+ &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+ int16x4_t v_s0 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+ h_s0[6], h_s0[7], x_filter, max));
+ int16x4_t v_s1 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+ h_s1[6], h_s1[7], x_filter, max));
+ int16x4_t v_s2 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+ h_s2[6], h_s2[7], x_filter, max));
+ int16x4_t v_s3 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+ h_s3[6], h_s3[7], x_filter, max));
+ int16x4_t v_s4 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+ h_s4[6], h_s4[7], x_filter, max));
+ int16x4_t v_s5 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+ h_s5[6], h_s5[7], x_filter, max));
+ int16x4_t v_s6 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+ h_s6[6], h_s6[7], x_filter, max));
+
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+ &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+ &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+ &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+ &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+ int16x4_t v_s7 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+ h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+ int16x4_t v_s8 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+ h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+ int16x4_t v_s9 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+ h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+ int16x4_t v_s10 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+ h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+ uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+ v_s6, v_s7, y_filter, max);
+ uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+ v_s7, v_s8, y_filter, max);
+ uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+ v_s8, v_s9, y_filter, max);
+ uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+ v_s9, v_s10, y_filter, max);
+
+ d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+ d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+ d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+ d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ v_s3 = v_s7;
+ v_s4 = v_s8;
+ v_s5 = v_s9;
+ v_s6 = v_s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+
+ return;
+ }
+
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+ &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+ &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+ &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+ &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+ load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+ &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+ load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+ &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+ load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+ &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+ int16x8_t v_s0 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+ h_s0[6], h_s0[7], x_filter, max));
+ int16x8_t v_s1 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+ h_s1[6], h_s1[7], x_filter, max));
+ int16x8_t v_s2 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+ h_s2[6], h_s2[7], x_filter, max));
+ int16x8_t v_s3 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+ h_s3[6], h_s3[7], x_filter, max));
+ int16x8_t v_s4 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+ h_s4[6], h_s4[7], x_filter, max));
+ int16x8_t v_s5 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+ h_s5[6], h_s5[7], x_filter, max));
+ int16x8_t v_s6 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+ h_s6[6], h_s6[7], x_filter, max));
+
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+ &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+ &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+ &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+ &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+ int16x8_t v_s7 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+ h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+ int16x8_t v_s8 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+ h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+ int16x8_t v_s9 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+ h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+ int16x8_t v_s10 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+ h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+ uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+ v_s6, v_s7, y_filter, max);
+ uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+ v_s7, v_s8, y_filter, max);
+ uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+ v_s8, v_s9, y_filter, max);
+ uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+ v_s9, v_s10, y_filter, max);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ v_s3 = v_s7;
+ v_s4 = v_s8;
+ v_s5 = v_s9;
+ v_s6 = v_s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c
new file mode 100644
index 0000000000..7fc0a57c90
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+
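+// Table used by highbd_convolve4_8() below: each 64-bit dot product yields
+// the results for output pixels i and i + 4, so the narrowed vector is
+// ordered { 0, 4, 1, 5, 2, 6, 3, 7 } and must be un-interleaved with
+// vpx_tbl_u16() before storing.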
+DECLARE_ALIGNED(16, static const uint16_t, kTblConv4_8[8]) = { 0, 2, 4, 6,
+ 1, 3, 5, 7 };
+
+static INLINE uint16x4_t highbd_convolve4_4(const int16x4_t s[4],
+ const int16x8_t filter,
+ const uint16x4_t max) {
+ int16x8_t s01 = vcombine_s16(s[0], s[1]);
+ int16x8_t s23 = vcombine_s16(s[2], s[3]);
+
+ int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s01, filter, 0);
+ int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s23, filter, 0);
+
+ int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+
+ uint16x4_t res_u16 = vqrshrun_n_s32(res_s32, FILTER_BITS);
+ return vmin_u16(res_u16, max);
+}
+
+static INLINE uint16x8_t highbd_convolve4_8(const int16x8_t s[4],
+ const int16x8_t filter,
+ const uint16x8_t max,
+ uint16x8_t idx) {
+ int64x2_t sum04 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[0], filter, 0);
+ int64x2_t sum15 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[1], filter, 0);
+ int64x2_t sum26 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[2], filter, 0);
+ int64x2_t sum37 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[3], filter, 0);
+
+ int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15));
+ int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37));
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS),
+ vqrshrun_n_s32(res1, FILTER_BITS));
+
+ res = vpx_tbl_u16(res, idx);
+
+ return vminq_u16(res, max);
+}
+
+static INLINE uint16x4_t highbd_convolve8_4(const int16x8_t s[4],
+ const int16x8_t filter,
+ const uint16x4_t max) {
+ int64x2_t sum[4];
+
+ sum[0] = vpx_dotq_s16(vdupq_n_s64(0), s[0], filter);
+ sum[1] = vpx_dotq_s16(vdupq_n_s64(0), s[1], filter);
+ sum[2] = vpx_dotq_s16(vdupq_n_s64(0), s[2], filter);
+ sum[3] = vpx_dotq_s16(vdupq_n_s64(0), s[3], filter);
+
+ sum[0] = vpaddq_s64(sum[0], sum[1]);
+ sum[2] = vpaddq_s64(sum[2], sum[3]);
+
+ int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2]));
+
+ uint16x4_t res_u16 = vqrshrun_n_s32(res_s32, FILTER_BITS);
+ return vmin_u16(res_u16, max);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8(const int16x8_t s[8],
+ const int16x8_t filter,
+ const uint16x8_t max) {
+ int64x2_t sum[8];
+
+ sum[0] = vpx_dotq_s16(vdupq_n_s64(0), s[0], filter);
+ sum[1] = vpx_dotq_s16(vdupq_n_s64(0), s[1], filter);
+ sum[2] = vpx_dotq_s16(vdupq_n_s64(0), s[2], filter);
+ sum[3] = vpx_dotq_s16(vdupq_n_s64(0), s[3], filter);
+ sum[4] = vpx_dotq_s16(vdupq_n_s64(0), s[4], filter);
+ sum[5] = vpx_dotq_s16(vdupq_n_s64(0), s[5], filter);
+ sum[6] = vpx_dotq_s16(vdupq_n_s64(0), s[6], filter);
+ sum[7] = vpx_dotq_s16(vdupq_n_s64(0), s[7], filter);
+
+ int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]);
+ int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]);
+ int64x2_t sum45 = vpaddq_s64(sum[4], sum[5]);
+ int64x2_t sum67 = vpaddq_s64(sum[6], sum[7]);
+
+ int32x4_t res0 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+ int32x4_t res1 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS),
+ vqrshrun_n_s32(res1, FILTER_BITS));
+ return vminq_u16(res, max);
+}
+
+static INLINE void highbd_convolve_4tap_horiz_sve(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t filters, int bd) {
+ const int16x8_t filter = vcombine_s16(filters, vdup_n_s16(0));
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x4_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 = highbd_convolve4_4(s0, filter, max);
+ uint16x4_t d1 = highbd_convolve4_4(s1, filter, max);
+ uint16x4_t d2 = highbd_convolve4_4(s2, filter, max);
+ uint16x4_t d3 = highbd_convolve4_4(s3, filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ const uint16x8_t idx = vld1q_u16(kTblConv4_8);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x8_t d0 = highbd_convolve4_8(s0, filter, max, idx);
+ uint16x8_t d1 = highbd_convolve4_8(s1, filter, max, idx);
+ uint16x8_t d2 = highbd_convolve4_8(s2, filter, max, idx);
+ uint16x8_t d3 = highbd_convolve4_8(s3, filter, max, idx);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
+
+static INLINE void highbd_convolve_8tap_horiz_sve(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x8_t filters, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x8_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 = highbd_convolve8_4(s0, filters, max);
+ uint16x4_t d1 = highbd_convolve8_4(s1, filters, max);
+ uint16x4_t d2 = highbd_convolve8_4(s2, filters, max);
+ uint16x4_t d3 = highbd_convolve8_4(s3, filters, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 = highbd_convolve8_8(s0, filters, max);
+ uint16x8_t d1 = highbd_convolve8_8(s1, filters, max);
+ uint16x8_t d2 = highbd_convolve8_8(s2, filters, max);
+ uint16x8_t d3 = highbd_convolve8_8(s3, filters, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
+
+void vpx_highbd_convolve8_horiz_sve(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (x_step_q4 != 16) {
+ vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
+ return;
+ }
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+ const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2);
+ highbd_convolve_4tap_horiz_sve(src - 1, src_stride, dst, dst_stride, w, h,
+ x_filter_4tap, bd);
+ } else {
+ const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
+ highbd_convolve_8tap_horiz_sve(src - 3, src_stride, dst, dst_stride, w, h,
+ x_filter_8tap, bd);
+ }
+}
+
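
The source offsets passed above (src - 1 for the 4-tap kernel, src - 3 for the 8-tap kernel) follow from filter alignment: an n-tap filter centered on the output pixel needs n / 2 - 1 input samples before it. A one-line sketch of the relationship (illustrative):

/* Samples required to the left of the output position for an n-tap
 * interpolation filter: 4 taps -> 1, 8 taps -> 3. */
static int left_extension(int taps) { return taps / 2 - 1; }
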
+void vpx_highbd_convolve8_avg_horiz_sve(const uint16_t *src,
+ ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (x_step_q4 != 16) {
+ vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
+ return;
+ }
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x8_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 = highbd_convolve8_4(s0, filters, max);
+ uint16x4_t d1 = highbd_convolve8_4(s1, filters, max);
+ uint16x4_t d2 = highbd_convolve8_4(s2, filters, max);
+ uint16x4_t d3 = highbd_convolve8_4(s3, filters, max);
+
+ d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+ d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+ d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+ d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 = highbd_convolve8_8(s0, filters, max);
+ uint16x8_t d1 = highbd_convolve8_8(s1, filters, max);
+ uint16x8_t d2 = highbd_convolve8_8(s2, filters, max);
+ uint16x8_t d3 = highbd_convolve8_8(s3, filters, max);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
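
Each SIMD lane in the kernels above computes the same arithmetic as this scalar reference: an 8-tap dot product, a rounding right shift by FILTER_BITS, and a clamp to the bit-depth maximum, mirroring the vpx_dotq_s16 + vqrshrun_n_s32 + vminq_u16 sequence. A sketch (the function name is illustrative):

#include <stdint.h>

#define FILTER_BITS 7

/* Scalar model of one output pixel of the high bit-depth 8-tap path. */
static uint16_t highbd_convolve8_pixel(const uint16_t *src,
                                       const int16_t filter[8], int bd) {
  int32_t sum = 0;
  for (int k = 0; k < 8; ++k) sum += (int32_t)filter[k] * (int32_t)src[k];
  /* Rounding shift, then saturate into [0, (1 << bd) - 1]. */
  sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
  if (sum < 0) sum = 0;
  const int32_t pixel_max = (1 << bd) - 1;
  return (uint16_t)(sum > pixel_max ? pixel_max : sum);
}
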
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c
new file mode 100644
index 0000000000..4ed7718f7d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+#include "vpx_dsp/arm/vpx_neon_sve2_bridge.h"
+
+// clang-format off
+DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = {
+ // Shift left and insert new last column in transposed 4x4 block.
+ 1, 2, 3, 0, 5, 6, 7, 4,
+ // Shift left and insert two new columns in transposed 4x4 block.
+ 2, 3, 0, 1, 6, 7, 4, 5,
+ // Shift left and insert three new columns in transposed 4x4 block.
+ 3, 0, 1, 2, 7, 4, 5, 6,
+};
+// clang-format on
+
+static INLINE void transpose_concat_4x4(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ int16x8_t res[2]) {
+ // Transpose 16-bit elements:
+ // s0: 00, 01, 02, 03
+ // s1: 10, 11, 12, 13
+ // s2: 20, 21, 22, 23
+ // s3: 30, 31, 32, 33
+ //
+ // res[0]: 00 10 20 30 01 11 21 31
+ // res[1]: 02 12 22 32 03 13 23 33
+
+ int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0));
+ int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0));
+ int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0));
+ int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0));
+
+ int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q));
+ int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q));
+
+ int32x4x2_t t0123 = vzipq_s32(s01, s23);
+
+ res[0] = vreinterpretq_s16_s32(t0123.val[0]);
+ res[1] = vreinterpretq_s16_s32(t0123.val[1]);
+}
+
+static INLINE void transpose_concat_8x4(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ int16x8_t res[4]) {
+ // Transpose 16-bit elements:
+ // s0: 00, 01, 02, 03, 04, 05, 06, 07
+ // s1: 10, 11, 12, 13, 14, 15, 16, 17
+ // s2: 20, 21, 22, 23, 24, 25, 26, 27
+ // s3: 30, 31, 32, 33, 34, 35, 36, 37
+ //
+ // res[0]: 00 10 20 30 01 11 21 31
+ // res[1]: 02 12 22 32 03 13 23 33
+ // res[2]: 04 14 24 34 05 15 25 35
+ // res[3]: 06 16 26 36 07 17 27 37
+
+ int16x8x2_t s01 = vzipq_s16(s0, s1);
+ int16x8x2_t s23 = vzipq_s16(s2, s3);
+
+ int32x4x2_t t0123_lo = vzipq_s32(vreinterpretq_s32_s16(s01.val[0]),
+ vreinterpretq_s32_s16(s23.val[0]));
+ int32x4x2_t t0123_hi = vzipq_s32(vreinterpretq_s32_s16(s01.val[1]),
+ vreinterpretq_s32_s16(s23.val[1]));
+
+ res[0] = vreinterpretq_s16_s32(t0123_lo.val[0]);
+ res[1] = vreinterpretq_s16_s32(t0123_lo.val[1]);
+ res[2] = vreinterpretq_s16_s32(t0123_hi.val[0]);
+ res[3] = vreinterpretq_s16_s32(t0123_hi.val[1]);
+}
+
+static INLINE void vpx_tbl2x4_s16(int16x8_t s0[4], int16x8_t s1[4],
+ int16x8_t res[4], uint16x8_t idx) {
+ res[0] = vpx_tbl2_s16(s0[0], s1[0], idx);
+ res[1] = vpx_tbl2_s16(s0[1], s1[1], idx);
+ res[2] = vpx_tbl2_s16(s0[2], s1[2], idx);
+ res[3] = vpx_tbl2_s16(s0[3], s1[3], idx);
+}
+
+static INLINE void vpx_tbl2x2_s16(int16x8_t s0[2], int16x8_t s1[2],
+ int16x8_t res[2], uint16x8_t idx) {
+ res[0] = vpx_tbl2_s16(s0[0], s1[0], idx);
+ res[1] = vpx_tbl2_s16(s0[1], s1[1], idx);
+}
+
+static INLINE uint16x4_t highbd_convolve8_4_v(const int16x8_t s_lo[2],
+                                              const int16x8_t s_hi[2],
+                                              const int16x8_t filter,
+                                              const uint16x4_t max) {
+ int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0);
+ sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1);
+
+ int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0);
+ sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1);
+
+ int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+
+ uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
+ return vmin_u16(res, max);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_v(const int16x8_t s_lo[4],
+ const int16x8_t s_hi[4],
+ const int16x8_t filter,
+ const uint16x8_t max) {
+ int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0);
+ sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1);
+
+ int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0);
+ sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1);
+
+ int64x2_t sum45 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[2], filter, 0);
+ sum45 = vpx_dotq_lane_s16(sum45, s_hi[2], filter, 1);
+
+ int64x2_t sum67 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[3], filter, 0);
+ sum67 = vpx_dotq_lane_s16(sum67, s_hi[3], filter, 1);
+
+ int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+ int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
+ vqrshrun_n_s32(sum4567, FILTER_BITS));
+ return vminq_u16(res, max);
+}
+
+static INLINE void highbd_convolve8_8tap_vert_sve2(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
+ assert(w >= 4 && h >= 4);
+ uint16x8x3_t merge_tbl_idx = vld1q_u16_x3(kDotProdMergeBlockTbl);
+
+  // Scale the indices by the true SVE vector length (svcnth() 16-bit lanes).
+ merge_tbl_idx.val[0] = vaddq_u16(
+ merge_tbl_idx.val[0],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)));
+ merge_tbl_idx.val[1] = vaddq_u16(
+ merge_tbl_idx.val[1],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)));
+ merge_tbl_idx.val[2] = vaddq_u16(
+ merge_tbl_idx.val[2],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)));
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
+ transpose_concat_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, s3456);
+
+ do {
+ int16x4_t s7, s8, s9, sA;
+
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+ int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
+ transpose_concat_4x4(s7, s8, s9, sA, s789A);
+
+ vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+ vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+ vpx_tbl2x2_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+ uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filter, max);
+ uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filter, max);
+ uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filter, max);
+ uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0123[0] = s4567[0];
+ s0123[1] = s4567[1];
+ s1234[0] = s5678[0];
+ s1234[1] = s5678[1];
+ s2345[0] = s6789[0];
+ s2345[1] = s6789[1];
+ s3456[0] = s789A[0];
+ s3456[1] = s789A[1];
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
+ transpose_concat_8x4(s0, s1, s2, s3, s0123);
+ transpose_concat_8x4(s1, s2, s3, s4, s1234);
+ transpose_concat_8x4(s2, s3, s4, s5, s2345);
+ transpose_concat_8x4(s3, s4, s5, s6, s3456);
+
+ do {
+ int16x8_t s7, s8, s9, sA;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+        int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
+ transpose_concat_8x4(s7, s8, s9, sA, s789A);
+
+ vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+ vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+ vpx_tbl2x4_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+ uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, filter, max);
+ uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, filter, max);
+ uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, filter, max);
+ uint16x8_t d3 = highbd_convolve8_8_v(s3456, s789A, filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0123[0] = s4567[0];
+ s0123[1] = s4567[1];
+ s0123[2] = s4567[2];
+ s0123[3] = s4567[3];
+ s1234[0] = s5678[0];
+ s1234[1] = s5678[1];
+ s1234[2] = s5678[2];
+ s1234[3] = s5678[3];
+ s2345[0] = s6789[0];
+ s2345[1] = s6789[1];
+ s2345[2] = s6789[2];
+ s2345[3] = s6789[3];
+ s3456[0] = s789A[0];
+ s3456[1] = s789A[1];
+ s3456[2] = s789A[2];
+ s3456[3] = s789A[3];
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+void vpx_highbd_convolve8_vert_sve2(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (y_step_q4 != 16) {
+ vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h, bd);
+ return;
+ }
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ assert(y_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+ vpx_highbd_convolve8_vert_neon(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
+ } else {
+ const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
+ highbd_convolve8_8tap_vert_sve2(src - 3 * src_stride, src_stride, dst,
+ dst_stride, w, h, y_filter_8tap, bd);
+ }
+}
+
+void vpx_highbd_convolve8_avg_vert_sve2(const uint16_t *src,
+ ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (y_step_q4 != 16) {
+ vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
+ return;
+ }
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+
+ src -= 3 * src_stride;
+
+ uint16x8x3_t merge_tbl_idx = vld1q_u16_x3(kDotProdMergeBlockTbl);
+
+  // Scale the indices by the true SVE vector length (svcnth() 16-bit lanes).
+ merge_tbl_idx.val[0] = vaddq_u16(
+ merge_tbl_idx.val[0],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)));
+ merge_tbl_idx.val[1] = vaddq_u16(
+ merge_tbl_idx.val[1],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)));
+ merge_tbl_idx.val[2] = vaddq_u16(
+ merge_tbl_idx.val[2],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)));
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
+ transpose_concat_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, s3456);
+
+ do {
+ int16x4_t s7, s8, s9, sA;
+
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+ int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
+ transpose_concat_4x4(s7, s8, s9, sA, s789A);
+
+ vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+ vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+ vpx_tbl2x2_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+ uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filters, max);
+ uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filters, max);
+ uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filters, max);
+ uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filters, max);
+
+ d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+ d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+ d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+ d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0123[0] = s4567[0];
+ s0123[1] = s4567[1];
+ s1234[0] = s5678[0];
+ s1234[1] = s5678[1];
+ s2345[0] = s6789[0];
+ s2345[1] = s6789[1];
+ s3456[0] = s789A[0];
+ s3456[1] = s789A[1];
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
+ transpose_concat_8x4(s0, s1, s2, s3, s0123);
+ transpose_concat_8x4(s1, s2, s3, s4, s1234);
+ transpose_concat_8x4(s2, s3, s4, s5, s2345);
+ transpose_concat_8x4(s3, s4, s5, s6, s3456);
+
+ do {
+ int16x8_t s7, s8, s9, sA;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+        int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
+ transpose_concat_8x4(s7, s8, s9, sA, s789A);
+
+ vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+ vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+ vpx_tbl2x4_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+ uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, filters, max);
+ uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, filters, max);
+ uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, filters, max);
+ uint16x8_t d3 = highbd_convolve8_8_v(s3456, s789A, filters, max);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0123[0] = s4567[0];
+ s0123[1] = s4567[1];
+ s0123[2] = s4567[2];
+ s0123[3] = s4567[3];
+ s1234[0] = s5678[0];
+ s1234[1] = s5678[1];
+ s1234[2] = s5678[2];
+ s1234[3] = s5678[3];
+ s2345[0] = s6789[0];
+ s2345[1] = s6789[1];
+ s2345[2] = s6789[2];
+ s2345[3] = s6789[3];
+ s3456[0] = s789A[0];
+ s3456[1] = s789A[1];
+ s3456[2] = s789A[2];
+ s3456[3] = s789A[3];
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
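
The svcnth() offsets applied to kDotProdMergeBlockTbl above exist because SVE TBL on a vector pair indexes a single table of 2 * VL 16-bit lanes: lane i of the second operand lives at index svcnth() + i, not 8 + i, once vectors are wider than 128 bits. A scalar model of the lookup (illustrative; not the libvpx bridge API):

#include <stdint.h>

/* Scalar model of a two-vector table lookup: lanes [0, vl) come from v0,
 * lanes [vl, 2 * vl) from v1, where vl is the 16-bit lane count of one
 * SVE vector, i.e. svcnth(). */
static int16_t tbl2_lane(const int16_t *v0, const int16_t *v1, unsigned vl,
                         unsigned idx) {
  return idx < vl ? v0[idx] : v1[idx - vl];
}
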
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
deleted file mode 100644
index 414ade3530..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/vpx_filter.h"
-#include "vpx_ports/mem.h"
-
-void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
- uint16_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4, int w,
- int h, int bd) {
- // + 1 to make it divisible by 4
- uint16_t temp[64 * 136];
- const int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- /* Filter starting 3 lines back. The neon implementation will ignore the given
- * height and filter a multiple of 4 lines. Since this goes in to the temp
- * buffer which has lots of extra room and is subsequently discarded this is
- * safe if somewhat less than ideal. */
- vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
- filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- intermediate_height, bd);
-
- /* Step into the temp buffer 3 lines to get the actual frame data */
- vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
-}
-
-void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
- uint16_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4,
- int w, int h, int bd) {
- // + 1 to make it divisible by 4
- uint16_t temp[64 * 136];
- const int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- /* This implementation has the same issues as above. In addition, we only want
- * to average the values after both passes.
- */
- vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
- filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- intermediate_height, bd);
- vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
- bd);
-}
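
For reference, the scratch-buffer sizing of the wrapper deleted above: assuming the VP9 bound of y_step_q4 <= 32 (at most 2:1 vertical scaling), the worst case fits within the 64 * 136 temp buffer. A worked sketch (illustrative):

#include <assert.h>

#define SUBPEL_BITS 4
#define SUBPEL_TAPS 8

static int intermediate_height(int h, int y0_q4, int y_step_q4) {
  return (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
}

static void check_worst_case(void) {
  /* 64 rows, 2:1 vertical scale, maximum initial phase of 15. */
  assert(intermediate_height(64, 15, 32) <= 136); /* 126 + 8 = 134 */
}
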
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
index c54e588239..579096d78a 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
@@ -162,7 +162,7 @@ FUN_FLIP_SIGN(16, q_) // flip_sign_16
#define FUN_FLIP_SIGN_BACK(w, r) \
static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \
- const int8x##w##_t sign_bit = vdup##r##n_s8(0x80); \
+ const int8x##w##_t sign_bit = vdup##r##n_s8((int8_t)0x80); \
return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit)); \
}
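
The cast added above matters because 0x80 is an int with value 128, which does not fit in int8_t; the implicit conversion is implementation-defined and trips -Wconstant-conversion style warnings. A minimal illustration:

#include <stdint.h>

/* (int8_t)0x80 yields -128, the sign-bit pattern of an 8-bit lane, and
 * makes the narrowing explicit rather than implicit. */
static int8_t sign_bit_s8(void) { return (int8_t)0x80; }
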
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
index 38b0b6c1a9..268c4bd962 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
+++ b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
@@ -154,11 +154,10 @@ static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) {
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
ptrdiff_t stride) {
uint32_t a;
- uint32x2_t a_u32;
- if (stride == 4) return vld1_u8(buf);
+ uint32x2_t a_u32 = vdup_n_u32(0);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vdup_n_u32(a);
+ a_u32 = vset_lane_u32(a, a_u32, 0);
memcpy(&a, buf, 4);
a_u32 = vset_lane_u32(a, a_u32, 1);
return vreinterpret_u8_u32(a_u32);
@@ -177,11 +176,10 @@ static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) {
static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
ptrdiff_t stride) {
uint64_t a;
- uint64x2_t a_u64;
- if (stride == 4) return vld1q_u16(buf);
+ uint64x2_t a_u64 = vdupq_n_u64(0);
memcpy(&a, buf, 8);
buf += stride;
- a_u64 = vdupq_n_u64(a);
+ a_u64 = vsetq_lane_u64(a, a_u64, 0);
memcpy(&a, buf, 8);
a_u64 = vsetq_lane_u64(a, a_u64, 1);
return vreinterpretq_u16_u64(a_u64);
@@ -191,10 +189,6 @@ static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
const uint8x8_t a) {
const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
- if (stride == 4) {
- vst1_u8(buf, a);
- return;
- }
uint32_to_mem(buf, vget_lane_u32(a_u32, 0));
buf += stride;
uint32_to_mem(buf, vget_lane_u32(a_u32, 1));
@@ -204,11 +198,10 @@ static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
ptrdiff_t stride) {
uint32_t a;
- uint32x4_t a_u32;
- if (stride == 4) return vld1q_u8(buf);
+ uint32x4_t a_u32 = vdupq_n_u32(0);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vdupq_n_u32(a);
+ a_u32 = vsetq_lane_u32(a, a_u32, 0);
memcpy(&a, buf, 4);
buf += stride;
a_u32 = vsetq_lane_u32(a, a_u32, 1);
@@ -225,10 +218,6 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride,
const uint8x16_t a) {
const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
- if (stride == 4) {
- vst1q_u8(buf, a);
- return;
- }
uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
buf += stride;
uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
@@ -449,6 +438,142 @@ static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p,
vst1q_u8(s, s7);
}
+static INLINE void store_u16_4x3(uint16_t *s, const ptrdiff_t p,
+ const uint16x4_t s0, const uint16x4_t s1,
+ const uint16x4_t s2) {
+ vst1_u16(s, s0);
+ s += p;
+ vst1_u16(s, s1);
+ s += p;
+ vst1_u16(s, s2);
+}
+
+static INLINE void load_s16_4x3(const int16_t *s, const ptrdiff_t p,
+ int16x4_t *s0, int16x4_t *s1, int16x4_t *s2) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+}
+
+static INLINE void load_s16_4x4(const int16_t *s, const ptrdiff_t p,
+ int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
+ int16x4_t *s3) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+}
+
+static INLINE void store_u16_4x4(uint16_t *s, const ptrdiff_t p,
+ const uint16x4_t s0, const uint16x4_t s1,
+ const uint16x4_t s2, const uint16x4_t s3) {
+ vst1_u16(s, s0);
+ s += p;
+ vst1_u16(s, s1);
+ s += p;
+ vst1_u16(s, s2);
+ s += p;
+ vst1_u16(s, s3);
+}
+
+static INLINE void load_s16_4x7(const int16_t *s, const ptrdiff_t p,
+ int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
+ int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
+ int16x4_t *s6) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+}
+
+static INLINE void load_s16_8x3(const int16_t *s, const ptrdiff_t p,
+ int16x8_t *s0, int16x8_t *s1, int16x8_t *s2) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x4(const int16_t *s, const ptrdiff_t p,
+ int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+ int16x8_t *s3) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+}
+
+static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
+ uint16x8_t *s3) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+}
+
+static INLINE void store_u16_8x4(uint16_t *s, const ptrdiff_t p,
+ const uint16x8_t s0, const uint16x8_t s1,
+ const uint16x8_t s2, const uint16x8_t s3) {
+ vst1q_u16(s, s0);
+ s += p;
+ vst1q_u16(s, s1);
+ s += p;
+ vst1q_u16(s, s2);
+ s += p;
+ vst1q_u16(s, s3);
+}
+
+static INLINE void store_u16_8x3(uint16_t *s, const ptrdiff_t p,
+ const uint16x8_t s0, const uint16x8_t s1,
+ const uint16x8_t s2) {
+ vst1q_u16(s, s0);
+ s += p;
+ vst1q_u16(s, s1);
+ s += p;
+ vst1q_u16(s, s2);
+}
+
+static INLINE void load_s16_8x7(const int16_t *s, const ptrdiff_t p,
+ int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+ int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
+ int16x8_t *s6) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+}
+
static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
@@ -470,4 +595,46 @@ static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
*s7 = vld1q_u16(s);
}
+static INLINE void load_s16_4x8(const int16_t *s, const ptrdiff_t p,
+ int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
+ int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
+ int16x4_t *s6, int16x4_t *s7) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+ s += p;
+ *s7 = vld1_s16(s);
+}
+
+static INLINE void load_s16_8x8(const int16_t *s, const ptrdiff_t p,
+ int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+ int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
+ int16x8_t *s6, int16x8_t *s7) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+}
+
#endif // VPX_VPX_DSP_ARM_MEM_NEON_H_
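
Note that the horizontal SVE kernels earlier in this patch call these row-oriented helpers with a pitch of 1, turning them into overlapping-window gathers. A small usage sketch (assumes mem_neon.h is on the include path; the wrapper name is illustrative):

#include <arm_neon.h>
#include "vpx_dsp/arm/mem_neon.h"

/* w[k] receives the 8 samples starting at s + k: the per-tap inputs of a
 * sliding filter window, rather than four consecutive rows. */
static inline void load_sliding_windows(const int16_t *s, int16x8_t w[4]) {
  load_s16_8x4(s, /*pitch=*/1, &w[0], &w[1], &w[2], &w[3]);
}
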
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c
new file mode 100644
index 0000000000..a18cbbd736
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+
+uint64_t vpx_sum_squares_2d_i16_sve(const int16_t *src, int stride, int size) {
+ if (size == 4) {
+ int16x4_t s[4];
+ int64x2_t sum = vdupq_n_s64(0);
+
+ s[0] = vld1_s16(src + 0 * stride);
+ s[1] = vld1_s16(src + 1 * stride);
+ s[2] = vld1_s16(src + 2 * stride);
+ s[3] = vld1_s16(src + 3 * stride);
+
+ int16x8_t s01 = vcombine_s16(s[0], s[1]);
+ int16x8_t s23 = vcombine_s16(s[2], s[3]);
+
+ sum = vpx_dotq_s16(sum, s01, s01);
+ sum = vpx_dotq_s16(sum, s23, s23);
+
+ return horizontal_add_uint64x2(vreinterpretq_u64_s64(sum));
+ } else {
+ int rows = size;
+ int64x2_t sum[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+ vdupq_n_s64(0) };
+
+ do {
+ const int16_t *src_ptr = src;
+ int cols = size;
+
+ do {
+ int16x8_t s[8];
+ load_s16_8x8(src_ptr, stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+
+ sum[0] = vpx_dotq_s16(sum[0], s[0], s[0]);
+ sum[1] = vpx_dotq_s16(sum[1], s[1], s[1]);
+ sum[2] = vpx_dotq_s16(sum[2], s[2], s[2]);
+ sum[3] = vpx_dotq_s16(sum[3], s[3], s[3]);
+ sum[0] = vpx_dotq_s16(sum[0], s[4], s[4]);
+ sum[1] = vpx_dotq_s16(sum[1], s[5], s[5]);
+ sum[2] = vpx_dotq_s16(sum[2], s[6], s[6]);
+ sum[3] = vpx_dotq_s16(sum[3], s[7], s[7]);
+
+ src_ptr += 8;
+ cols -= 8;
+ } while (cols);
+
+ src += 8 * stride;
+ rows -= 8;
+ } while (rows);
+
+ sum[0] = vaddq_s64(sum[0], sum[1]);
+ sum[2] = vaddq_s64(sum[2], sum[3]);
+ sum[0] = vaddq_s64(sum[0], sum[2]);
+
+ return horizontal_add_uint64x2(vreinterpretq_u64_s64(sum[0]));
+ }
+}
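
The SVE kernel above is a lane-parallel form of this scalar reference (mirroring the existing vpx_sum_squares_2d_i16_c fallback): squared 16-bit residuals accumulated in 64 bits, where a single product can never exceed 2^30. Sketch:

#include <stdint.h>

static uint64_t sum_squares_2d_i16_scalar(const int16_t *src, int stride,
                                          int size) {
  uint64_t sum = 0;
  for (int r = 0; r < size; ++r) {
    for (int c = 0; c < size; ++c) {
      const int32_t v = src[r * stride + c]; /* |v| <= 32768, v * v <= 2^30 */
      sum += (uint64_t)(v * v);
    }
  }
  return sum;
}
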
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
index 74f85a6bb6..c989a6721b 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
+++ b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -524,12 +524,20 @@ static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1,
*a7 = vreinterpretq_s32_s64(c3.val[1]);
}
-// Note: Using 'd' registers or 'q' registers has almost identical speed. We use
-// 'q' registers here to save some instructions.
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
uint8x8_t *a6, uint8x8_t *a7) {
- // Swap 8 bit elements. Goes from:
+  // Widen to 128-bit registers (usually a no-op once inlined).
+ const uint8x16_t a0q = vcombine_u8(*a0, vdup_n_u8(0));
+ const uint8x16_t a1q = vcombine_u8(*a1, vdup_n_u8(0));
+ const uint8x16_t a2q = vcombine_u8(*a2, vdup_n_u8(0));
+ const uint8x16_t a3q = vcombine_u8(*a3, vdup_n_u8(0));
+ const uint8x16_t a4q = vcombine_u8(*a4, vdup_n_u8(0));
+ const uint8x16_t a5q = vcombine_u8(*a5, vdup_n_u8(0));
+ const uint8x16_t a6q = vcombine_u8(*a6, vdup_n_u8(0));
+ const uint8x16_t a7q = vcombine_u8(*a7, vdup_n_u8(0));
+
+ // Zip 8 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07
// a1: 10 11 12 13 14 15 16 17
// a2: 20 21 22 23 24 25 26 27
@@ -539,43 +547,41 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
// a6: 60 61 62 63 64 65 66 67
// a7: 70 71 72 73 74 75 76 77
// to:
- // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
- // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
- // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
- // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
-
- const uint8x16x2_t b0 =
- vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
- const uint8x16x2_t b1 =
- vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));
-
- // Swap 16 bit elements resulting in:
- // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
- // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
- // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
- // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
-
- const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
- vreinterpretq_u16_u8(b1.val[0]));
- const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
- vreinterpretq_u16_u8(b1.val[1]));
-
- // Unzip 32 bit elements resulting in:
+ // b0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // b1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // b2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // b3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const uint8x16_t b0 = vzipq_u8(a0q, a1q).val[0];
+ const uint8x16_t b1 = vzipq_u8(a2q, a3q).val[0];
+ const uint8x16_t b2 = vzipq_u8(a4q, a5q).val[0];
+ const uint8x16_t b3 = vzipq_u8(a6q, a7q).val[0];
+
+ // Zip 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ // c0.val[1]: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ // c1.val[0]: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+  // c1.val[1]: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ const uint16x8x2_t c0 =
+ vzipq_u16(vreinterpretq_u16_u8(b0), vreinterpretq_u16_u8(b1));
+ const uint16x8x2_t c1 =
+ vzipq_u16(vreinterpretq_u16_u8(b2), vreinterpretq_u16_u8(b3));
+
+ // Zip 32 bit elements resulting in:
// d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
- // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d0.val[1]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d1.val[0]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
// d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
- const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ const uint32x4x2_t d0 = vzipq_u32(vreinterpretq_u32_u16(c0.val[0]),
vreinterpretq_u32_u16(c1.val[0]));
- const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ const uint32x4x2_t d1 = vzipq_u32(vreinterpretq_u32_u16(c0.val[1]),
vreinterpretq_u32_u16(c1.val[1]));
*a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
*a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
- *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
- *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
- *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
- *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ *a2 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+ *a3 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ *a4 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+ *a5 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
*a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
*a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
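
The rewritten transpose above replaces the trn/uzp pairs with three rounds of zips; the postcondition is unchanged. A scalar statement of that postcondition (illustrative):

#include <stdint.h>
#include <string.h>

/* After transpose_u8_8x8, output row i holds the former column i. */
static void transpose_8x8_scalar(uint8_t m[8][8]) {
  uint8_t t[8][8];
  for (int i = 0; i < 8; ++i)
    for (int j = 0; j < 8; ++j) t[j][i] = m[i][j];
  memcpy(m, t, sizeof(t));
}
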
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
index 65fb67c984..037ea1142d 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -20,44 +20,36 @@
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
-// Note:
-// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src).
-// 2. After refactoring the shared code in kernel loops with inline functions,
-// the decoder speed dropped a lot when using gcc compiler. Therefore there is
-// no refactoring for those parts by now.
-// 3. For horizontal convolve, there is an alternative optimization that
-// convolves a single row in each loop. For each row, 8 sample banks with 4 or 8
-// samples in each are read from memory: src, (src+1), (src+2), (src+3),
-// (src+4), (src+5), (src+6), (src+7), or prepared by vector extract
-// instructions. This optimization is much faster in speed unit test, but slowed
-// down the whole decoder by 5%.
-
-static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride, int w,
- int h, const int16x4_t filter) {
+static INLINE void convolve_4tap_horiz_neon(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h,
+ const int16x8_t filter) {
+ // 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const uint8x8_t x_filter =
+ vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1);
+
+ // Neon does not have lane-referencing multiply or multiply-accumulate
+ // instructions that operate on vectors of 8-bit elements. This means we have
+ // to duplicate filter taps into a whole vector and use standard multiply /
+ // multiply-accumulate instructions.
+ const uint8x8_t filter_taps[4] = { vdup_lane_u8(x_filter, 2),
+ vdup_lane_u8(x_filter, 3),
+ vdup_lane_u8(x_filter, 4),
+ vdup_lane_u8(x_filter, 5) };
+
if (w == 4) {
do {
- int16x4_t s0[4], s1[4];
-
- int16x8_t t0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src)));
- s0[0] = vget_low_s16(vextq_s16(t0, t0, 0));
- s0[1] = vget_low_s16(vextq_s16(t0, t0, 1));
- s0[2] = vget_low_s16(vextq_s16(t0, t0, 2));
- s0[3] = vget_low_s16(vextq_s16(t0, t0, 3));
+ uint8x8_t s01[4];
- int16x8_t t1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + src_stride)));
- s1[0] = vget_low_s16(vextq_s16(t1, t1, 0));
- s1[1] = vget_low_s16(vextq_s16(t1, t1, 1));
- s1[2] = vget_low_s16(vextq_s16(t1, t1, 2));
- s1[3] = vget_low_s16(vextq_s16(t1, t1, 3));
+ s01[0] = load_unaligned_u8(src + 0, src_stride);
+ s01[1] = load_unaligned_u8(src + 1, src_stride);
+ s01[2] = load_unaligned_u8(src + 2, src_stride);
+ s01[3] = load_unaligned_u8(src + 3, src_stride);
- int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter);
- int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter);
- uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d01 = convolve4_8(s01[0], s01[1], s01[2], s01[3], filter_taps);
- store_u8(dst, dst_stride, d01);
+ store_unaligned_u8(dst, dst_stride, d01);
src += 2 * src_stride;
dst += 2 * dst_stride;
@@ -70,25 +62,20 @@ static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src,
int width = w;
do {
- int16x8_t t0[2], t1[2];
- int16x8_t s0[4], s1[4];
-
- t0[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- t0[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 8)));
- s0[0] = vextq_s16(t0[0], t0[1], 0);
- s0[1] = vextq_s16(t0[0], t0[1], 1);
- s0[2] = vextq_s16(t0[0], t0[1], 2);
- s0[3] = vextq_s16(t0[0], t0[1], 3);
-
- t1[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride)));
- t1[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride + 8)));
- s1[0] = vextq_s16(t1[0], t1[1], 0);
- s1[1] = vextq_s16(t1[0], t1[1], 1);
- s1[2] = vextq_s16(t1[0], t1[1], 2);
- s1[3] = vextq_s16(t1[0], t1[1], 3);
-
- uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter);
- uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter);
+ uint8x8_t s0[4], s1[4];
+
+ s0[0] = vld1_u8(s + 0);
+ s0[1] = vld1_u8(s + 1);
+ s0[2] = vld1_u8(s + 2);
+ s0[3] = vld1_u8(s + 3);
+
+ s1[0] = vld1_u8(s + src_stride + 0);
+ s1[1] = vld1_u8(s + src_stride + 1);
+ s1[2] = vld1_u8(s + src_stride + 2);
+ s1[3] = vld1_u8(s + src_stride + 3);
+
+ uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter_taps);
+ uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter_taps);
vst1_u8(d, d0);
vst1_u8(d + dst_stride, d1);
@@ -103,47 +90,41 @@ static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src,
}
}
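
The halved-tap comment in convolve_4tap_horiz_neon above relies on all VP9 4-tap and bilinear coefficients being even, so dividing each tap by two is exact and the final rounding shift drops to FILTER_BITS - 1. A scalar sketch of the equivalent arithmetic (signed taps here; the Neon code instead uses absolute-value taps with multiply-accumulate and multiply-subtract):

#include <stdint.h>

#define FILTER_BITS 7

static uint8_t convolve4_pixel(const uint8_t *src, const int16_t taps[4]) {
  int32_t sum = 0;
  for (int k = 0; k < 4; ++k) sum += (taps[k] / 2) * (int32_t)src[k];
  /* Rounding shift by FILTER_BITS - 1 restores the halved scale. */
  sum = (sum + (1 << (FILTER_BITS - 2))) >> (FILTER_BITS - 1);
  if (sum < 0) sum = 0;
  return (uint8_t)(sum > 255 ? 255 : sum);
}
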
-static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride, int w,
- int h, const int16x8_t filter) {
- uint8x8_t t0, t1, t2, t3;
-
+static INLINE void convolve_8tap_horiz_neon(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h,
+ const int16x8_t filter) {
if (h == 4) {
- uint8x8_t d01, d23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
+ uint8x8_t t0, t1, t2, t3;
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
src += 7;
do {
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+ transpose_u8_8x4(&t7, &t8, &t9, &t10);
+ int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
+ int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
+ int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
+
+ int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
transpose_u8_4x4(&d01, &d23);
@@ -162,52 +143,33 @@ static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src,
w -= 4;
} while (w != 0);
} else {
- int width;
- const uint8_t *s;
- uint8x8_t t4, t5, t6, t7, d04, d15, d26, d37;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
if (w == 4) {
do {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
&t7);
- src += 8 * src_stride;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
+
transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
- d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
- d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
- d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
transpose_u8_8x4(&d04, &d15, &d26, &d37);
@@ -216,57 +178,53 @@ static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src,
store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);
+ src += 8 * src_stride;
dst += 8 * dst_stride;
h -= 8;
} while (h > 0);
} else {
- uint8_t *d;
- uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
- int16x8_t s11, s12, s13, s14;
-
do {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- width = w;
- s = src + 7;
- d = dst;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ const uint8_t *s = src + 7;
+ uint8_t *d = dst;
+ int width = w;
do {
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
- d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
- d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
- d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
- d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
+ uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
+ load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
+ &t15);
+
+ transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
+ uint8x8_t d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
+ uint8x8_t d6 =
+ convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
+ uint8x8_t d7 =
+ convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
@@ -304,17 +262,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
(void)y0_q4;
(void)y_step_q4;
+ const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
+
if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements.
- */
- const int16x4_t x_filter_4tap = vshr_n_s16(vld1_s16(filter[x0_q4] + 2), 1);
- vpx_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
- x_filter_4tap);
+ convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
+ x_filter);
} else {
- const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
- vpx_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
- x_filter_8tap);
+ convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
+ x_filter);
}
}
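
/* Editor's illustration (not part of the patch): the dispatch above keys on
 * the kernel's tap count. A minimal scalar sketch of the idea, assuming
 * 4-tap and bilinear kernels are stored 0-padded inside the 8-tap array:
 *
 *   static int sketch_filter_taps(const int16_t k[8]) {
 *     if (k[0] | k[1] | k[6] | k[7]) return 8;  // outer taps in use
 *     if (k[2] | k[5]) return 4;
 *     return 2;                                 // bilinear
 *   }
 *
 * vpx_get_filter_taps() is the library's real helper; this sketch only
 * models its contract, not its implementation. */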
@@ -324,7 +279,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
const int16x8_t filters = vld1q_s16(filter[x0_q4]);
- uint8x8_t t0, t1, t2, t3;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
@@ -337,48 +291,41 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
src -= 3;
if (h == 4) {
- uint8x8_t d01, d23, dd01, dd23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
+ uint8x8_t t0, t1, t2, t3;
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
src += 7;
do {
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+ transpose_u8_8x4(&t7, &t8, &t9, &t10);
+ int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
+ int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
+ int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
+
+ int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
transpose_u8_4x4(&d01, &d23);
- dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride);
- dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride);
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
@@ -398,61 +345,40 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
w -= 4;
} while (w != 0);
} else {
- int width;
- const uint8_t *s;
- uint8x8_t t4, t5, t6, t7;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
if (w == 4) {
- uint8x8_t d04, d15, d26, d37, dd04, dd15, dd26, dd37;
-
do {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
&t7);
- src += 8 * src_stride;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
+
transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
transpose_u8_8x4(&d04, &d15, &d26, &d37);
- dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride);
- dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride);
- dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride);
- dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride);
+ uint8x8_t dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride);
+ uint8x8_t dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride);
+ uint8x8_t dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride);
+ uint8x8_t dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride);
d04 = vrhadd_u8(d04, dd04);
d15 = vrhadd_u8(d15, dd15);
@@ -464,65 +390,54 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);
+ src += 8 * src_stride;
dst += 8 * dst_stride;
h -= 8;
} while (h != 0);
} else {
- uint8_t *d;
- uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
- int16x8_t s11, s12, s13, s14;
-
do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- width = w;
- s = src + 7;
- d = dst;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ const uint8_t *s = src + 7;
+ uint8_t *d = dst;
+ int width = w;
do {
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
- d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
- d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
- d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
- d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
+ uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
+ load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
+ &t15);
+
+ transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+ uint8x8_t d5 =
+ convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+ uint8x8_t d6 =
+ convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+ uint8x8_t d7 =
+ convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
@@ -556,152 +471,37 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
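
/* Editor's note (illustrative, not part of the patch): the _avg variants
 * above blend the filtered result with the bytes already in dst via
 * vrhadd_u8, a rounding halving add; per byte this computes
 * d = (filtered + dst + 1) >> 1. */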
-static INLINE void vpx_convolve_4tap_vert_neon(const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride, int w,
- int h, const int16x4_t filter) {
- if (w == 4) {
- uint8x8_t t0, t1, t2, t3, d01, d23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, d0, d1, d2, d3;
-
- load_u8_8x3(src, src_stride, &t0, &t1, &t2);
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-
- src += 3 * src_stride;
-
- do {
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
-
- d0 = convolve4_4(s0, s1, s2, s3, filter);
- d1 = convolve4_4(s1, s2, s3, s4, filter);
- d2 = convolve4_4(s2, s3, s4, s5, filter);
- d3 = convolve4_4(s3, s4, s5, s6, filter);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h != 0);
- } else {
- int height;
- const uint8_t *s;
- uint8_t *d;
- uint8x8_t t0, t1, t2, t3, d0, d1, d2, d3;
- int16x8_t s0, s1, s2, s3, s4, s5, s6;
-
- do {
- load_u8_8x3(src, src_stride, &t0, &t1, &t2);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-
- s = src + 3 * src_stride;
- d = dst;
- height = h;
-
- do {
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
-
- d0 = convolve4_8(s0, s1, s2, s3, filter);
- d1 = convolve4_8(s1, s2, s3, s4, filter);
- d2 = convolve4_8(s2, s3, s4, s5, filter);
- d3 = convolve4_8(s3, s4, s5, s6, filter);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w != 0);
- }
-}
-
-static INLINE void vpx_convolve_8tap_vert_neon(const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride, int w,
- int h, const int16x8_t filter) {
+static INLINE void convolve_8tap_vert_neon(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h,
+ const int16x8_t filter) {
if (w == 4) {
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
src += 7 * src_stride;
do {
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
-
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
+ int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
+ int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));
+
+ int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -718,54 +518,33 @@ static INLINE void vpx_convolve_8tap_vert_neon(const uint8_t *src,
h -= 4;
} while (h != 0);
} else {
- int height;
- const uint8_t *s;
- uint8_t *d;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- s = src + 7 * src_stride;
- d = dst;
- height = h;
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ const uint8_t *s = src + 7 * src_stride;
+ uint8_t *d = dst;
+ int height = h;
do {
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
-
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
@@ -800,17 +579,14 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
(void)x_step_q4;
(void)y_step_q4;
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements.
- */
- const int16x4_t y_filter_4tap = vshr_n_s16(vld1_s16(filter[y0_q4] + 2), 1);
- vpx_convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride,
- w, h, y_filter_4tap);
+ convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
+ y_filter);
} else {
- const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
- vpx_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst,
- dst_stride, w, h, y_filter_8tap);
+ convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, dst_stride,
+ w, h, y_filter);
}
}
@@ -832,45 +608,35 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
src -= 3 * src_stride;
if (w == 4) {
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23, dd01, dd23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
src += 7 * src_stride;
do {
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
-
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
- dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
- dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
+ int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
+ int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));
+
+ int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
@@ -890,54 +656,33 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
h -= 4;
} while (h != 0);
} else {
- int height;
- const uint8_t *s;
- uint8_t *d;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- s = src + 7 * src_stride;
- d = dst;
- height = h;
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ const uint8_t *s = src + 7 * src_stride;
+ uint8_t *d = dst;
+ int height = h;
do {
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
-
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride));
d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride));
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
index 4ecaee0f99..10cc761ccd 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -17,360 +17,6 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"
-#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
- ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4,
- int y_step_q4, int w, int h);
-
-static INLINE int16x4_t convolve4_4_sdot_partial(const int8x16_t samples,
- const int32x4_t correction,
- const int8x8_t filters) {
- /* Accumulate dot product into 'correction' to account for range clamp. */
- int32x4_t sum = vdotq_lane_s32(correction, samples, filters, 0);
-
- /* Further narrowing and packing is performed by the caller. */
- return vmovn_s32(sum);
-}
-
-static INLINE int16x4_t convolve4_4_sdot(const uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16_t permute_tbl) {
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- int8x16_t clamped_samples =
- vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0);
-
- /* Further narrowing and packing is performed by the caller. */
- return vmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve4_8_sdot_partial(const int8x16_t samples_lo,
- const int8x16_t samples_hi,
- const int32x4_t correction,
- const int8x8_t filters) {
- /* Sample range-clamping and permutation are performed by the caller. */
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- int32x4_t sum0 = vdotq_lane_s32(correction, samples_lo, filters, 0);
- /* Second 4 output values. */
- int32x4_t sum1 = vdotq_lane_s32(correction, samples_hi, filters, 0);
-
- /* Narrow and re-pack. */
- int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
- /* We halved the filter values so -1 from right shift. */
- return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE uint8x8_t convolve4_8_sdot(const uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x2_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[2];
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- int32x4_t sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
- /* Second 4 output values. */
- int32x4_t sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
-
- /* Narrow and re-pack. */
- int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
- /* We halved the filter values so -1 from right shift. */
- return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
- const int8x16_t samples_hi,
- const int32x4_t correction,
- const int8x8_t filters) {
- /* Sample range-clamping and permutation are performed by the caller. */
- int32x4_t sum;
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- sum = vdotq_lane_s32(correction, samples_lo, filters, 0);
- sum = vdotq_lane_s32(sum, samples_hi, filters, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE int16x4_t convolve8_4_sdot(const uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x2_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[2];
- int32x4_t sum;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
- sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
- const int8x16_t samples0_hi,
- const int8x16_t samples1_lo,
- const int8x16_t samples1_hi,
- const int32x4_t correction,
- const int8x8_t filters) {
- /* Sample range-clamping and permutation are performed by the caller. */
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum0 = vdotq_lane_s32(correction, samples0_lo, filters, 0);
- sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1);
- /* Second 4 output values. */
- sum1 = vdotq_lane_s32(correction, samples1_lo, filters, 0);
- sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-static INLINE uint8x8_t convolve8_8_sdot(const uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[3];
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
- sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
- /* Second 4 output values. */
- sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
- sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4,
- int w, int h);
-
-static INLINE int16x4_t convolve4_4_usdot_partial(const uint8x16_t samples,
- const int8x8_t filters) {
- int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples, filters, 0);
-
- /* Further narrowing and packing is performed by the caller. */
- return vmovn_s32(sum);
-}
-
-static INLINE int16x4_t convolve4_4_usdot(const uint8x16_t samples,
- const int8x8_t filters,
- const uint8x16_t permute_tbl) {
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
-
- int32x4_t sum =
- vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0);
-
- /* Further narrowing and packing is performed by the caller. */
- return vmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve4_8_usdot_partial(const uint8x16_t samples_lo,
- const uint8x16_t samples_hi,
- const int8x8_t filters) {
- /* Sample permutation is performed by the caller. */
- /* First 4 output values. */
- int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
- /* Second 4 output values. */
- int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples_hi, filters, 0);
-
- /* Narrow and re-pack. */
- int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
- /* We halved the filter values so -1 from right shift. */
- return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE uint8x8_t convolve4_8_usdot(const uint8x16_t samples,
- const int8x8_t filters,
- const uint8x16x2_t permute_tbl) {
- uint8x16_t permuted_samples[2];
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-
- /* First 4 output values. */
- int32x4_t sum0 =
- vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
- /* Second 4 output values. */
- int32x4_t sum1 =
- vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
-
- /* Narrow and re-pack. */
- int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
- /* We halved the filter values so -1 from right shift. */
- return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
- const uint8x16_t samples_hi,
- const int8x8_t filters) {
- /* Sample permutation is performed by the caller. */
- int32x4_t sum;
-
- sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
- sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples,
- const int8x8_t filters,
- const uint8x16x2_t permute_tbl) {
- uint8x16_t permuted_samples[2];
- int32x4_t sum;
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-
- sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
- sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
- const uint8x16_t samples0_hi,
- const uint8x16_t samples1_lo,
- const uint8x16_t samples1_hi,
- const int8x8_t filters) {
- /* Sample permutation is performed by the caller. */
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* First 4 output values. */
- sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
- sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
- /* Second 4 output values. */
- sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
- sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples,
- const int8x8_t filters,
- const uint8x16x3_t permute_tbl) {
- uint8x16_t permuted_samples[3];
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
- /* First 4 output values. */
- sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
- sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
- /* Second 4 output values. */
- sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
- sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1,
- const int16x4_t s2, const int16x4_t s3,
- const int16x4_t filters) {
- int16x4_t sum = vmul_lane_s16(s0, filters, 0);
- sum = vmla_lane_s16(sum, s1, filters, 1);
- sum = vmla_lane_s16(sum, s2, filters, 2);
- sum = vmla_lane_s16(sum, s3, filters, 3);
- return sum;
-}
-
-static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1,
- const int16x8_t s2, const int16x8_t s3,
- const int16x4_t filters) {
- int16x8_t sum = vmulq_lane_s16(s0, filters, 0);
- sum = vmlaq_lane_s16(sum, s1, filters, 1);
- sum = vmlaq_lane_s16(sum, s2, filters, 2);
- sum = vmlaq_lane_s16(sum, s3, filters, 3);
- /* We halved the filter values so -1 from right shift. */
- return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
@@ -428,4 +74,99 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
filters);
}
+// 2-tap (bilinear) filter values are always positive, but 4-tap filter values
+// are negative on the outer edges (taps 0 and 3), with taps 1 and 2 having much
+// greater positive values to compensate. To use instructions that operate on
+// 8-bit types we also need the types to be unsigned. Subtracting the products
+// of taps 0 and 3 from the products of taps 1 and 2 always works given that
+// 2-tap filters are 0-padded.
+static INLINE uint8x8_t convolve4_8(const uint8x8_t s0, const uint8x8_t s1,
+ const uint8x8_t s2, const uint8x8_t s3,
+ const uint8x8_t filter_taps[4]) {
+ uint16x8_t sum = vmull_u8(s1, filter_taps[1]);
+ sum = vmlal_u8(sum, s2, filter_taps[2]);
+ sum = vmlsl_u8(sum, s0, filter_taps[0]);
+ sum = vmlsl_u8(sum, s3, filter_taps[3]);
+ // We halved the filter values so -1 from right shift.
+ return vqrshrun_n_s16(vreinterpretq_s16_u16(sum), FILTER_BITS - 1);
+}
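+
+// Editor's illustration (not part of the patch): a scalar model of the
+// unsigned trick above, assuming a halved kernel such as { -2, 18, 18, -2 }
+// stored as magnitudes { 2, 18, 18, 2 }. Any transient modulo-2^16 wrap in
+// the unsigned accumulator is harmless because the true sum fits in int16:
+//
+//   static uint8_t convolve4_scalar(const uint8_t s[4], const uint8_t t[4]) {
+//     uint16_t sum = (uint16_t)(s[1] * t[1] + s[2] * t[2]);
+//     sum -= (uint16_t)(s[0] * t[0] + s[3] * t[3]);
+//     int16_t v = (int16_t)sum;           // recover the signed value
+//     int r = (v + 32) >> 6;              // round; FILTER_BITS - 1 == 6
+//     return (uint8_t)(r < 0 ? 0 : r > 255 ? 255 : r);  // saturate
+//   }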
+
+static INLINE void convolve_4tap_vert_neon(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h,
+ const int16x8_t filter) {
+ // 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const uint8x8_t y_filter =
+ vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1);
+
+ // Neon does not have lane-referencing multiply or multiply-accumulate
+ // instructions that operate on vectors of 8-bit elements. This means we have
+ // to duplicate filter taps into a whole vector and use standard multiply /
+ // multiply-accumulate instructions.
+ const uint8x8_t filter_taps[4] = { vdup_lane_u8(y_filter, 2),
+ vdup_lane_u8(y_filter, 3),
+ vdup_lane_u8(y_filter, 4),
+ vdup_lane_u8(y_filter, 5) };
+
+ if (w == 4) {
+ uint8x8_t s01 = load_unaligned_u8(src + 0 * src_stride, src_stride);
+ uint8x8_t s12 = load_unaligned_u8(src + 1 * src_stride, src_stride);
+
+ src += 2 * src_stride;
+
+ do {
+ uint8x8_t s23 = load_unaligned_u8(src + 0 * src_stride, src_stride);
+ uint8x8_t s34 = load_unaligned_u8(src + 1 * src_stride, src_stride);
+ uint8x8_t s45 = load_unaligned_u8(src + 2 * src_stride, src_stride);
+ uint8x8_t s56 = load_unaligned_u8(src + 3 * src_stride, src_stride);
+
+ uint8x8_t d01 = convolve4_8(s01, s12, s23, s34, filter_taps);
+ uint8x8_t d23 = convolve4_8(s23, s34, s45, s56, filter_taps);
+
+ store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ s01 = s45;
+ s12 = s56;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+
+ uint8x8_t s0, s1, s2;
+ load_u8_8x3(s, src_stride, &s0, &s1, &s2);
+
+ s += 3 * src_stride;
+
+ do {
+ uint8x8_t s3, s4, s5, s6;
+ load_u8_8x4(s, src_stride, &s3, &s4, &s5, &s6);
+
+ uint8x8_t d0 = convolve4_8(s0, s1, s2, s3, filter_taps);
+ uint8x8_t d1 = convolve4_8(s1, s2, s3, s4, filter_taps);
+ uint8x8_t d2 = convolve4_8(s2, s3, s4, s5, filter_taps);
+ uint8x8_t d3 = convolve4_8(s3, s4, s5, s6, filter_taps);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
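+
+// Editor's illustration (not part of the patch): in the w == 4 path above,
+// each 8-byte vector packs two consecutive 4-pixel rows, so one convolve4_8
+// call yields two output rows and only two vectors carry across iterations:
+//   s01 = { row0[0..3], row1[0..3] }, s12 = { row1[0..3], row2[0..3] }
+//   d01 = convolve4_8(s01, s12, s23, s34, taps) = { out0[0..3], out1[0..3] }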
+
#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
index 00bac3b9cf..b05a49d3fe 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
@@ -20,270 +20,139 @@
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
+// Filter values always sum to 128 (i.e. 1 << FILTER_BITS).
+#define FILTER_SUM 128
+
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};
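
/* Editor's illustration (not part of the patch): applied to source bytes
 * { s0, s1, ..., s15 }, the first 16 table entries above produce four
 * overlapping 4-sample windows, one per 32-bit dot-product lane:
 *   lane 0: s0..s3   lane 1: s1..s4   lane 2: s2..s5   lane 3: s3..s6
 * so a single vdotq_lane_s32 evaluates four adjacent filter positions. */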
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
- 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
- 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
-};
-
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
- /* Shift left and insert new last column in transposed 4x4 block. */
+ // Shift left and insert new last column in transposed 4x4 block.
1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
- /* Shift left and insert two new columns in transposed 4x4 block. */
+ // Shift left and insert two new columns in transposed 4x4 block.
2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
- /* Shift left and insert three new columns in transposed 4x4 block. */
+ // Shift left and insert three new columns in transposed 4x4 block.
3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};
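
/* Editor's illustration (assumed usage with a two-register table lookup such
 * as vqtbl2q_u8, where indices 16..31 select from the second register): row
 * one of the table above turns each transposed 4-byte lane { a0, a1, a2, a3 }
 * into { a1, a2, a3, b0 }, dropping the oldest sample and appending one new
 * column b0; in effect a vectorised shift register for the vertical taps. */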
-static INLINE void vpx_convolve_4tap_2d_horiz_neon_dotprod(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x16_t range_limit) {
- uint8x16_t s0, s1, s2, s3;
-
- if (w == 4) {
- const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- do {
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- load_u8_16x3(src, src_stride, &s0, &s1, &s2);
-
- d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8_4x1(dst + 2 * dst_stride, d23);
- } else {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16_t permute_tbl) {
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t samples_128 =
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
- d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl);
- store_u8_8x3(d, dst_stride, d0, d1, d2);
+ // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide
+ // by 2 since we halved the filter values.)
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2);
+ int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0);
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- }
+ // Further narrowing and packing is performed by the caller.
+ return vmovn_s32(sum);
}
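
/* Editor's derivation (not part of the patch) for the accumulator seed above:
 *   sum_i f_i * s_i = sum_i f_i * (s_i - 128) + 128 * sum_i f_i
 * The signed dot product sees the biased samples (s_i - 128); because the
 * halved 4-tap values sum to FILTER_SUM / 2 = 64, seeding the accumulator
 * with 128 * 64 = 8192 adds the missing 128 * sum_i f_i term back, restoring
 * the dot product of the halved taps with the original samples exactly. */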
-static INLINE void vpx_convolve_8tap_2d_horiz_neon_dotprod(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x16_t range_limit) {
- uint8x16_t s0, s1, s2, s3;
-
- if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- do {
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- load_u8_16x3(src, src_stride, &s0, &s1, &s2);
-
- d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8_4x1(dst + 2 * dst_stride, d23);
- } else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x3(s, src_stride, &s0, &s1, &s2);
-
- d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
-
- store_u8_8x3(d, dst_stride, d0, d1, d2);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- }
+static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl) {
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t samples_128 =
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
+ vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
+
+ // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide
+ // by 2 since we halved the filter values.)
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2);
+ // First 4 output values.
+ int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
+ // Second 4 output values.
+ int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+ // We halved the filter values so -1 from right shift.
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
-void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
- ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4,
- int y_step_q4, int w, int h) {
- const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
- const int32x4_t correction_8tap =
- vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS)));
- const uint8x16_t range_limit = vdupq_n_u8(128);
-
- assert((intptr_t)dst % 4 == 0);
- assert(dst_stride % 4 == 0);
- assert(x_step_q4 == 16);
-
- (void)x_step_q4;
- (void)y0_q4;
- (void)y_step_q4;
-
- if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
- const int8x8_t x_filter_4tap =
- vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
- const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
- vpx_convolve_4tap_2d_horiz_neon_dotprod(src - 1, src_stride, dst,
- dst_stride, w, h, x_filter_4tap,
- correction_4tap, range_limit);
+static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl) {
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t samples_128 =
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
+ vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
+
+ // Accumulate into 128 * FILTER_SUM to account for range transform.
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+ int32x4_t sum = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
+ sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1);
+
+ // Further narrowing and packing is performed by the caller.
+ return vshrn_n_s32(sum, 1);
+}
- } else {
- vpx_convolve_8tap_2d_horiz_neon_dotprod(src - 3, src_stride, dst,
- dst_stride, w, h, x_filter_8tap,
- correction_8tap, range_limit);
- }
+static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x3_t permute_tbl) {
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t samples_128 =
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
+ vqtbl1q_s8(samples_128, permute_tbl.val[1]),
+ vqtbl1q_s8(samples_128, permute_tbl.val[2]) };
+
+ // Accumulate into 128 * FILTER_SUM to account for range transform.
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+ // First 4 output values.
+ int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
+ sum0 = vdotq_lane_s32(sum0, perm_samples[1], filters, 1);
+ // Second 4 output values.
+ int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
+ sum1 = vdotq_lane_s32(sum1, perm_samples[2], filters, 1);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
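
Unlike the 4-tap case, 8-tap filter values can be odd, so convolve8_4_h and convolve8_8_h cannot pre-halve the taps. Instead they truncate the 32-bit sum by one bit when narrowing to 16 bits (vshrn_n_s32(sum, 1)), and the final rounding shift drops to FILTER_BITS - 1. A scalar model of that two-stage narrowing, checked against a single rounding shift by FILTER_BITS (illustrative; FILTER_BITS == 7):

    #include <assert.h>
    #include <stdint.h>

    static uint8_t narrow_two_stage(int32_t sum) {
      int16_t half = (int16_t)(sum >> 1);   /* vshrn_n_s32(sum, 1) */
      int32_t out = (half + (1 << 5)) >> 6; /* vqrshrun_n_s16(sum, 7 - 1) */
      if (out < 0) out = 0;                 /* unsigned saturation */
      if (out > 255) out = 255;
      return (uint8_t)out;
    }

    int main(void) {
      for (int32_t sum = -4096; sum <= 40000; sum++) {
        int32_t direct = (sum + (1 << 6)) >> 7; /* single rounding shift */
        if (direct < 0) direct = 0;
        if (direct > 255) direct = 255;
        int32_t diff = direct - narrow_two_stage(sum);
        /* The truncating first stage can lose at most the sum's low bit. */
        assert(diff >= -1 && diff <= 1);
      }
      return 0;
    }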
-static INLINE void vpx_convolve_4tap_horiz_neon_dotprod(
+static INLINE void convolve_4tap_horiz_neon_dotprod(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x16_t range_limit) {
- uint8x16_t s0, s1, s2, s3;
-
+ ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
if (w == 4) {
- const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23;
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- t1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- t2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- t3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+ int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
+ int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
+ int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
+ int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
+ // We halved the filter values so -1 from right shift.
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -293,23 +162,21 @@ static INLINE void vpx_convolve_4tap_horiz_neon_dotprod(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+ uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
@@ -324,26 +191,22 @@ static INLINE void vpx_convolve_4tap_horiz_neon_dotprod(
}
}
-static INLINE void vpx_convolve_8tap_horiz_neon_dotprod(
+static INLINE void convolve_8tap_horiz_neon_dotprod(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x16_t range_limit) {
- uint8x16_t s0, s1, s2, s3;
-
+ ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+ int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl);
+ int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -353,23 +216,21 @@ static INLINE void vpx_convolve_8tap_horiz_neon_dotprod(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
@@ -389,11 +250,6 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
- const int32x4_t correction_8tap =
- vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS)));
- const uint8x16_t range_limit = vdupq_n_u8(128);
-
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
assert(x_step_q4 == 16);
@@ -403,21 +259,21 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
(void)y_step_q4;
if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
+ // Load 4-tap filter into first 4 elements of the vector.
+ // All 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
const int8x8_t x_filter_4tap =
- vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
- const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
- vpx_convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride,
- w, h, x_filter_4tap, correction_4tap,
- range_limit);
+ vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+
+ convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride, w, h,
+ x_filter_4tap);
} else {
- vpx_convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride,
- w, h, x_filter_8tap, correction_8tap,
- range_limit);
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+ convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride, w, h,
+ x_filter_8tap);
}
}
@@ -428,10 +284,6 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
int x_step_q4, int y0_q4,
int y_step_q4, int w, int h) {
const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
- const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
- const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
- const uint8x16_t range_limit = vdupq_n_u8(128);
- uint8x16_t s0, s1, s2, s3;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
@@ -444,22 +296,21 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
src -= 3;
if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23, dd01, dd23;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
- t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
- t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
- t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+ int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl);
+ int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl);
+ int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl);
+ int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
- dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
- dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
@@ -472,24 +323,23 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
h -= 4;
} while (h != 0);
} else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
- d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
- d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
- d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);
+ uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl);
+ uint8x8_t dd0, dd1, dd2, dd3;
load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
d0 = vrhadd_u8(d0, dd0);
@@ -511,260 +361,142 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
}
static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
- int8x8_t a3, int8x16_t *b,
- const uint8x16_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, XX, XX, XX, XX
- * a1: 10, 11, 12, 13, XX, XX, XX, XX
- * a2: 20, 21, 22, 23, XX, XX, XX, XX
- * a3: 30, 31, 32, 33, XX, XX, XX, XX
- *
- * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
- *b = vqtbl2q_s8(samples, permute_tbl);
+ int8x8_t a3, int8x16_t *b) {
+ // Transpose 8-bit elements and concatenate result rows as follows:
+ // a0: 00, 01, 02, 03, XX, XX, XX, XX
+ // a1: 10, 11, 12, 13, XX, XX, XX, XX
+ // a2: 20, 21, 22, 23, XX, XX, XX, XX
+ // a3: 30, 31, 32, 33, XX, XX, XX, XX
+ //
+ // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+
+ int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
+ int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
+ int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
+ int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
+
+ int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
+ int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];
+
+ int16x8_t a0123 =
+ vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0];
+
+ *b = vreinterpretq_s8_s16(a0123);
}
static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
int8x8_t a3, int8x16_t *b0,
- int8x16_t *b1,
- const uint8x16x2_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, 04, 05, 06, 07
- * a1: 10, 11, 12, 13, 14, 15, 16, 17
- * a2: 20, 21, 22, 23, 24, 25, 26, 27
- * a3: 30, 31, 32, 33, 34, 35, 36, 37
- *
- * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
- *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
- *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+ int8x16_t *b1) {
+ // Transpose 8-bit elements and concatenate result rows as follows:
+ // a0: 00, 01, 02, 03, 04, 05, 06, 07
+ // a1: 10, 11, 12, 13, 14, 15, 16, 17
+ // a2: 20, 21, 22, 23, 24, 25, 26, 27
+ // a3: 30, 31, 32, 33, 34, 35, 36, 37
+ //
+ // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+
+ int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
+ int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
+ int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
+ int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
+
+ int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
+ int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];
+
+ int16x8x2_t a0123 =
+ vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23));
+
+ *b0 = vreinterpretq_s8_s16(a0123.val[0]);
+ *b1 = vreinterpretq_s8_s16(a0123.val[1]);
}
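
The zip-based rewrite above removes the dot_prod_tran_concat_tbl lookup. A scalar check that interleaving bytes of rows 0/1 and 2/3, then interleaving the resulting 16-bit pairs, really produces the column-major layout documented in the comments (illustrative sketch of the vectors' low halves):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      const uint8_t a[4][4] = { { 0x00, 0x01, 0x02, 0x03 },
                                { 0x10, 0x11, 0x12, 0x13 },
                                { 0x20, 0x21, 0x22, 0x23 },
                                { 0x30, 0x31, 0x32, 0x33 } };
      uint8_t a01[8], a23[8], b[16];
      for (int i = 0; i < 4; i++) { /* byte zip, as vzipq_s8().val[0] */
        a01[2 * i + 0] = a[0][i]; a01[2 * i + 1] = a[1][i];
        a23[2 * i + 0] = a[2][i]; a23[2 * i + 1] = a[3][i];
      }
      for (int i = 0; i < 4; i++) { /* 16-bit zip, as vzipq_s16().val[0] */
        b[4 * i + 0] = a01[2 * i + 0]; b[4 * i + 1] = a01[2 * i + 1];
        b[4 * i + 2] = a23[2 * i + 0]; b[4 * i + 3] = a23[2 * i + 1];
      }
      /* b: 00, 10, 20, 30, 01, 11, 21, 31, ..., 03, 13, 23, 33 */
      for (int col = 0; col < 4; col++)
        for (int row = 0; row < 4; row++)
          assert(b[4 * col + row] == a[row][col]);
      return 0;
    }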
-static INLINE void vpx_convolve_4tap_vert_neon_dotprod(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x8_t range_limit) {
- const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t t0, t1, t2, t3, t4, t5, t6;
- int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int8x16x2_t samples_LUT;
-
- if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- int8x16_t s0123, s1234, s2345, s3456, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- src += 7 * src_stride;
+static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo,
+ const int8x16_t samples_hi,
+ const int8x8_t filters) {
+ // The sample range transform and permutation are performed by the caller.
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ // Accumulate into 128 * FILTER_SUM to account for range transform.
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+ int32x4_t sum = vdotq_lane_s32(acc, samples_lo, filters, 0);
+ sum = vdotq_lane_s32(sum, samples_hi, filters, 1);
- do {
- uint8x8_t t7, t8, t9, t10;
- load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
-
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
-
- d0 = convolve4_4_sdot_partial(s0123, correction, filter);
- d1 = convolve4_4_sdot_partial(s1234, correction, filter);
- d2 = convolve4_4_sdot_partial(s2345, correction, filter);
- d3 = convolve4_4_sdot_partial(s3456, correction, filter);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s0123 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s1234 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s2345 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- s3456 = s78910;
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h != 0);
- } else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
- do {
- height = h;
- s = src;
- d = dst;
-
- load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s += 7 * src_stride;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
-
- do {
- uint8x8_t t7, t8, t9, t10;
- load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
-
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
-
- d0 = convolve4_8_sdot_partial(s0123_lo, s0123_hi, correction, filter);
- d1 = convolve4_8_sdot_partial(s1234_lo, s1234_hi, correction, filter);
- d2 = convolve4_8_sdot_partial(s2345_lo, s2345_hi, correction, filter);
- d3 = convolve4_8_sdot_partial(s3456_lo, s3456_hi, correction, filter);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s0123_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s1234_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s2345_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- s3456_lo = s78910_lo;
-
- samples_LUT.val[0] = s3456_hi;
- samples_LUT.val[1] = s78910_hi;
- s0123_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s1234_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s2345_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- s3456_hi = s78910_hi;
+ // Further narrowing and packing is performed by the caller.
+ return vshrn_n_s32(sum, 1);
+}
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w != 0);
- }
+static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo,
+ const int8x16_t samples0_hi,
+ const int8x16_t samples1_lo,
+ const int8x16_t samples1_hi,
+ const int8x8_t filters) {
+ // The sample range transform and permutation are performed by the caller.
+
+ // Accumulate into 128 * FILTER_SUM to account for range transform.
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+ // First 4 output values.
+ int32x4_t sum0 = vdotq_lane_s32(acc, samples0_lo, filters, 0);
+ sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1);
+ // Second 4 output values.
+ int32x4_t sum1 = vdotq_lane_s32(acc, samples1_lo, filters, 0);
+ sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
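
convolve8_4_v and convolve8_8_v split each 8-tap dot product across two sdot instructions: lane 0 of the filter vector covers taps 0-3 against the "lo" samples, and lane 1 covers taps 4-7 against the "hi" samples. The decomposition is just additivity of the dot product, e.g.:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      const int8_t s[8] = { -5, 17, 42, -128, 9, 0, 127, -1 };
      const int8_t f[8] = { -1, 3, -10, 72, 72, -10, 3, -1 };
      int32_t lo = 0, hi = 0, full = 0;
      for (int i = 0; i < 4; i++) lo += s[i] * f[i]; /* vdotq_lane_s32(acc, lo, f, 0) */
      for (int i = 4; i < 8; i++) hi += s[i] * f[i]; /* vdotq_lane_s32(sum, hi, f, 1) */
      for (int i = 0; i < 8; i++) full += s[i] * f[i];
      assert(lo + hi == full);
      return 0;
    }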
-static INLINE void vpx_convolve_8tap_vert_neon_dotprod(
+static INLINE void convolve_8tap_vert_neon_dotprod(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x8_t range_limit) {
+ ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t t0, t1, t2, t3, t4, t5, t6;
- int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int8x16x2_t samples_LUT;
if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
src += 7 * src_stride;
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+ int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+ int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+ int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+ int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+ int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+ int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ int8x16_t s0123, s1234, s2345, s3456;
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456);
do {
uint8x8_t t7, t8, t9, t10;
-
load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+ int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+ int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+ int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+ int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+ int8x16_t s78910;
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ int8x16x2_t samples_LUT = { { s3456, s78910 } };
+ int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter);
- d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter);
- d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter);
- d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
+ int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
+ int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
+ int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -781,83 +513,70 @@ static INLINE void vpx_convolve_8tap_vert_neon_dotprod(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
- s6789_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
do {
- height = h;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
s += 7 * src_stride;
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+ int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+ int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+ int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+ int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+ int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+ int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi;
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
uint8x8_t t7, t8, t9, t10;
-
load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+ int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+ int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+ int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+ int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
+ int8x16_t s78910_lo, s78910_hi;
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+ int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
samples_LUT.val[0] = s3456_hi;
samples_LUT.val[1] = s78910_hi;
- s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- correction, filter);
- d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- correction, filter);
- d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- correction, filter);
- d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- correction, filter);
+ int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ uint8x8_t d0 =
+ convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
+ uint8x8_t d1 =
+ convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
+ uint8x8_t d2 =
+ convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
+ uint8x8_t d3 =
+ convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123_lo = s4567_lo;
s0123_hi = s4567_hi;
s1234_lo = s5678_lo;
@@ -883,11 +602,6 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
- const int32x4_t correction_8tap =
- vdupq_n_s32(vaddlvq_s16(vshll_n_s8(y_filter_8tap, FILTER_BITS)));
- const uint8x8_t range_limit = vdup_n_u8(128);
-
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
assert(y_step_q4 == 16);
@@ -897,20 +611,15 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
(void)y_step_q4;
if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
- const int8x8_t y_filter_4tap =
- vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2);
- const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
- vpx_convolve_4tap_vert_neon_dotprod(src - src_stride, src_stride, dst,
- dst_stride, w, h, y_filter_4tap,
- correction_4tap, range_limit);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
+ y_filter);
} else {
- vpx_convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst,
- dst_stride, w, h, y_filter_8tap,
- correction_8tap, range_limit);
+ const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+ convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst,
+ dst_stride, w, h, y_filter);
}
}
@@ -921,13 +630,7 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
int x_step_q4, int y0_q4,
int y_step_q4, int w, int h) {
const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
- const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
- const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
- const uint8x8_t range_limit = vdup_n_u8(128);
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t t0, t1, t2, t3, t4, t5, t6;
- int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int8x16x2_t samples_LUT;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
@@ -940,59 +643,54 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
src -= 3 * src_stride;
if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23, dd01, dd23;
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
src += 7 * src_stride;
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+ int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+ int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+ int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+ int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+ int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+ int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ int8x16_t s0123, s1234, s2345, s3456;
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456);
do {
uint8x8_t t7, t8, t9, t10;
-
load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+ int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+ int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+ int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+ int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+ int8x16_t s78910;
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ int8x16x2_t samples_LUT = { { s3456, s78910 } };
+ int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
- d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
- d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
- d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ int16x4_t d0 = convolve8_4_v(s0123, s4567, filters);
+ int16x4_t d1 = convolve8_4_v(s1234, s5678, filters);
+ int16x4_t d2 = convolve8_4_v(s2345, s6789, filters);
+ int16x4_t d3 = convolve8_4_v(s3456, s78910, filters);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
- dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
- dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
@@ -1000,8 +698,8 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123 = s4567;
s1234 = s5678;
s2345 = s6789;
@@ -1012,79 +710,67 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
- s6789_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
do {
- height = h;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
s += 7 * src_stride;
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+ int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+ int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+ int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+ int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+ int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+ int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi;
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
uint8x8_t t7, t8, t9, t10;
-
load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+ int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+ int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+ int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+ int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
+ int8x16_t s78910_lo, s78910_hi;
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+ int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
samples_LUT.val[0] = s3456_hi;
samples_LUT.val[1] = s78910_hi;
- s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- correction, filters);
- d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- correction, filters);
- d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- correction, filters);
- d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- correction, filters);
-
+ int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ uint8x8_t d0 =
+ convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters);
+ uint8x8_t d1 =
+ convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters);
+ uint8x8_t d2 =
+ convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters);
+ uint8x8_t d3 =
+ convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters);
+
+ uint8x8_t dd0, dd1, dd2, dd3;
load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
d0 = vrhadd_u8(d0, dd0);
@@ -1094,8 +780,8 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123_lo = s4567_lo;
s0123_hi = s4567_hi;
s1234_lo = s5678_lo;
@@ -1115,3 +801,275 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
} while (w != 0);
}
}
+
+static INLINE void convolve_4tap_2d_neon_dotprod(const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride, int w,
+ int h, const int8x8_t x_filter,
+ const uint8x8_t y_filter) {
+ // Neon does not have lane-referencing multiply or multiply-accumulate
+ // instructions that operate on vectors of 8-bit elements. This means we have
+ // to duplicate filter taps into a whole vector and use standard multiply /
+ // multiply-accumulate instructions.
+ const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2),
+ vdup_lane_u8(y_filter, 3),
+ vdup_lane_u8(y_filter, 4),
+ vdup_lane_u8(y_filter, 5) };
+
+ if (w == 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+
+ uint8x16_t h_s0, h_s1, h_s2;
+ load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
+
+ int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl);
+ int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl);
+ int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl);
+ // We halved the filter values so -1 from right shift.
+ uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1);
+
+ src += 3 * src_stride;
+
+ do {
+ uint8x16_t h_s3, h_s4, h_s5, h_s6;
+ load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+ int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl);
+ int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl);
+ int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl);
+ int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl);
+ // We halved the filter values so -1 from right shift.
+ uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1);
+ uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1);
+ uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4);
+ uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4);
+
+ uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps);
+ uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps);
+
+ store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ v_s01 = v_s45;
+ v_s12 = v_s56;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+
+ uint8x16_t h_s0, h_s1, h_s2;
+ load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2);
+
+ uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl);
+ uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl);
+ uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl);
+
+ s += 3 * src_stride;
+
+ do {
+ uint8x16_t h_s3, h_s4, h_s5, h_s6;
+ load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+ uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl);
+ uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl);
+ uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl);
+ uint8x8_t v_s6 = convolve4_8_h(h_s6, x_filter, permute_tbl);
+
+ uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps);
+ uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps);
+ uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps);
+ uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
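
The 2D 4-tap path never materializes an intermediate buffer: it keeps the last three horizontally filtered rows live across iterations and slides the window down four rows at a time. A scalar sketch of that structure, with hypothetical stand-ins hfilter_row() and vfilter4() for convolve4_8_h() and convolve4_8():

    #include <string.h>

    #define W 8 /* one 8-pixel column strip, as in the w > 4 branch */

    /* Hypothetical stand-in for convolve4_8_h(): any per-row horizontal pass. */
    static void hfilter_row(const unsigned char *src, unsigned char *out) {
      memcpy(out, src, W);
    }

    /* Hypothetical stand-in for convolve4_8(): any 4-row vertical pass. */
    static void vfilter4(const unsigned char *r0, const unsigned char *r1,
                         const unsigned char *r2, const unsigned char *r3,
                         unsigned char *out) {
      for (int j = 0; j < W; j++)
        out[j] = (unsigned char)((r0[j] + r1[j] + r2[j] + r3[j] + 2) >> 2);
    }

    static void convolve_4tap_2d_model(const unsigned char *src, int src_stride,
                                       unsigned char *dst, int dst_stride, int h) {
      unsigned char rows[7][W]; /* 3 carried rows + 4 new rows per iteration */
      for (int i = 0; i < 3; i++) hfilter_row(src + i * src_stride, rows[i]);
      src += 3 * src_stride;
      do {
        for (int i = 0; i < 4; i++) hfilter_row(src + i * src_stride, rows[3 + i]);
        for (int i = 0; i < 4; i++)
          vfilter4(rows[i], rows[i + 1], rows[i + 2], rows[i + 3],
                   dst + i * dst_stride);
        /* v_s0 = v_s4; v_s1 = v_s5; v_s2 = v_s6; in the code above. */
        memmove(rows[0], rows[4], 3 * W);
        src += 4 * src_stride;
        dst += 4 * dst_stride;
        h -= 4;
      } while (h != 0);
    }

    int main(void) {
      unsigned char src[(16 + 3) * W], dst[16 * W];
      for (unsigned i = 0; i < sizeof(src); i++) src[i] = (unsigned char)i;
      convolve_4tap_2d_model(src, W, dst, W, 16);
      return 0;
    }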
+
+static INLINE void convolve_8tap_2d_horiz_neon_dotprod(
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+ int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 3);
+
+ // Process final three rows (h % 4 == 3). See vpx_convolve8_neon_dotprod()
+ // below for further details on possible values of block height.
+ uint8x16_t s0, s1, s2;
+ load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+ int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 =
+ vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8_4x1(dst + 2 * dst_stride, d23);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 3);
+
+ // Process final three rows (h % 4 == 3). See vpx_convolve8_neon_dotprod()
+ // below for further details on possible values of block height.
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2;
+ load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+
+ store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ }
+}
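
The h > 3 loop bound and the three-row epilogue exist because this helper filters the intermediate block of the 2D path, whose height is h + SUBPEL_TAPS - 1. Assuming block heights are always multiples of 4, as the convolve interface provides here, the remainder is always exactly 3 (illustrative check):

    #include <assert.h>

    int main(void) {
      for (int h = 4; h <= 64; h += 4) {
        int im_height = h + 8 - 1;  /* SUBPEL_TAPS == 8 */
        assert(im_height % 4 == 3); /* main loop leaves a 3-row tail */
      }
      return 0;
    }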
+
+void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
+ const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+ // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
+ // lines post both horizontally and vertically.
+ const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
+ const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;
+
+ if (x_filter_taps == 4 && y_filter_taps == 4) {
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ // 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const int8x8_t x_filter_4tap =
+ vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+ const uint8x8_t y_filter_4tap =
+ vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1);
+
+ convolve_4tap_2d_neon_dotprod(src - horiz_offset - vert_offset, src_stride,
+ dst, dst_stride, w, h, x_filter_4tap,
+ y_filter_4tap);
+ return;
+ }
+
+ // Given our constraints: w <= 64, h <= 64, taps <= 8, we can reduce the
+ // maximum buffer size to 64 * (64 + 7).
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+ const int im_stride = 64;
+ const int im_height = h + SUBPEL_TAPS - 1;
+
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+ convolve_8tap_2d_horiz_neon_dotprod(src - horiz_offset - vert_offset,
+ src_stride, im_block, im_stride, w,
+ im_height, x_filter_8tap);
+
+ convolve_8tap_vert_neon_dotprod(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter_8tap);
+}
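
The pointer and buffer arithmetic in vpx_convolve8_neon_dotprod() follows from the filter geometry: an N-tap phase needs N / 2 - 1 rows (and columns) before each output position and N / 2 after it. A small restatement of the bookkeeping (illustrative):

    #include <assert.h>

    int main(void) {
      const int taps = 8;                 /* SUBPEL_TAPS */
      const int offset = taps / 2 - 1;    /* 3 rows/cols before the anchor */
      const int h = 64, w = 64;           /* maximum block size */
      const int im_height = h + taps - 1; /* 3 prior + h + 4 post rows */
      assert(offset + h + taps / 2 == im_height);
      assert(w * im_height == 64 * 71);   /* DECLARE_ALIGNED im_block bound */
      return 0;
    }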
+
+void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+ const int im_stride = 64;
+
+ // Averaging convolution always uses an 8-tap filter.
+ // Account for the vertical phase needing 3 lines prior and 4 lines post.
+ const int im_height = h + SUBPEL_TAPS - 1;
+ const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1;
+
+ assert(y_step_q4 == 16);
+ assert(x_step_q4 == 16);
+
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+ convolve_8tap_2d_horiz_neon_dotprod(src - offset - offset * src_stride,
+ src_stride, im_block, im_stride, w,
+ im_height, x_filter_8tap);
+
+ vpx_convolve8_avg_vert_neon_dotprod(im_block + offset * im_stride, im_stride,
+ dst, dst_stride, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
index bcad1dd121..e582004133 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
@@ -26,255 +26,112 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
- 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
- 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
-};
-
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
- /* Shift left and insert new last column in transposed 4x4 block. */
+ // Shift left and insert new last column in transposed 4x4 block.
1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
- /* Shift left and insert two new columns in transposed 4x4 block. */
+ // Shift left and insert two new columns in transposed 4x4 block.
2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
- /* Shift left and insert three new columns in transposed 4x4 block. */
+ // Shift left and insert three new columns in transposed 4x4 block.
3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};
-static INLINE void vpx_convolve_4tap_2d_horiz_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
- uint8x16_t s0, s1, s2, s3;
+static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16_t permute_tbl) {
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
- if (w == 4) {
- const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
+ int32x4_t sum =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0);
- do {
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve4_4_usdot(s0, filter, perm_tbl);
- d1 = convolve4_4_usdot(s1, filter, perm_tbl);
- d2 = convolve4_4_usdot(s2, filter, perm_tbl);
- d3 = convolve4_4_usdot(s3, filter, perm_tbl);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- load_u8_16x3(src, src_stride, &s0, &s1, &s2);
-
- d0 = convolve4_4_usdot(s0, filter, perm_tbl);
- d1 = convolve4_4_usdot(s1, filter, perm_tbl);
- d2 = convolve4_4_usdot(s2, filter, perm_tbl);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8_4x1(dst + 2 * dst_stride, d23);
- } else {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve4_8_usdot(s0, filter, perm_tbl);
- d1 = convolve4_8_usdot(s1, filter, perm_tbl);
- d2 = convolve4_8_usdot(s2, filter, perm_tbl);
- d3 = convolve4_8_usdot(s3, filter, perm_tbl);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x3(s, src_stride, &s0, &s1, &s2);
-
- d0 = convolve4_8_usdot(s0, filter, perm_tbl);
- d1 = convolve4_8_usdot(s1, filter, perm_tbl);
- d2 = convolve4_8_usdot(s2, filter, perm_tbl);
-
- store_u8_8x3(d, dst_stride, d0, d1, d2);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- }
+ // Further narrowing and packing is performed by the caller.
+ return vmovn_s32(sum);
}
-static INLINE void vpx_convolve_8tap_2d_horiz_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
- uint8x16_t s0, s1, s2, s3;
-
- if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- do {
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_4_usdot(s0, filter, perm_tbl);
- d1 = convolve8_4_usdot(s1, filter, perm_tbl);
- d2 = convolve8_4_usdot(s2, filter, perm_tbl);
- d3 = convolve8_4_usdot(s3, filter, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- load_u8_16x3(src, src_stride, &s0, &s1, &s2);
-
- d0 = convolve8_4_usdot(s0, filter, perm_tbl);
- d1 = convolve8_4_usdot(s1, filter, perm_tbl);
- d2 = convolve8_4_usdot(s2, filter, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8_4x1(dst + 2 * dst_stride, d23);
- } else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_usdot(s0, filter, perm_tbl);
- d1 = convolve8_8_usdot(s1, filter, perm_tbl);
- d2 = convolve8_8_usdot(s2, filter, perm_tbl);
- d3 = convolve8_8_usdot(s3, filter, perm_tbl);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x3(s, src_stride, &s0, &s1, &s2);
-
- d0 = convolve8_8_usdot(s0, filter, perm_tbl);
- d1 = convolve8_8_usdot(s1, filter, perm_tbl);
- d2 = convolve8_8_usdot(s2, filter, perm_tbl);
-
- store_u8_8x3(d, dst_stride, d0, d1, d2);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- }
+static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl) {
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
+ vqtbl1q_u8(samples, permute_tbl.val[1]) };
+
+ // First 4 output values.
+ int32x4_t sum0 =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ // Second 4 output values.
+ int32x4_t sum1 =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+ // We halved the filter values so -1 from right shift.
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
-void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4,
- int w, int h) {
- const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
-
- assert((intptr_t)dst % 4 == 0);
- assert(dst_stride % 4 == 0);
- assert(x_step_q4 == 16);
-
- (void)x_step_q4;
- (void)y0_q4;
- (void)y_step_q4;
-
- if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
- const int8x8_t x_filter_4tap =
- vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
- vpx_convolve_4tap_2d_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride,
- w, h, x_filter_4tap);
-
- } else {
- vpx_convolve_8tap_2d_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride,
- w, h, x_filter_8tap);
- }
+static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl) {
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
+ vqtbl1q_u8(samples, permute_tbl.val[1]) };
+
+ int32x4_t sum =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
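+ // Lane 0 selects filter taps 0-3 and lane 1 selects taps 4-7, so the two
+ // dot products above apply the full 8-tap kernel to each output sample.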
+
+ // Further narrowing and packing is performed by the caller.
+ return vshrn_n_s32(sum, 1);
}
-static INLINE void vpx_convolve_4tap_horiz_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
- uint8x16_t s0, s1, s2, s3;
+static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x3_t permute_tbl) {
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ uint8x16_t permuted_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
+ vqtbl1q_u8(samples, permute_tbl.val[1]),
+ vqtbl1q_u8(samples, permute_tbl.val[2]) };
+
+ // First 4 output values.
+ int32x4_t sum0 =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+ // Second 4 output values.
+ int32x4_t sum1 =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
+ sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
+static INLINE void convolve_4tap_horiz_neon_i8mm(const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride, int w,
+ int h, const int8x8_t filter) {
if (w == 4) {
- const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23;
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve4_4_usdot(s0, filter, perm_tbl);
- t1 = convolve4_4_usdot(s1, filter, perm_tbl);
- t2 = convolve4_4_usdot(s2, filter, perm_tbl);
- t3 = convolve4_4_usdot(s3, filter, perm_tbl);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+ int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
+ int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
+ int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
+ int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
+ // We halved the filter values so -1 from right shift.
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -284,23 +141,21 @@ static INLINE void vpx_convolve_4tap_horiz_neon_i8mm(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve4_8_usdot(s0, filter, perm_tbl);
- d1 = convolve4_8_usdot(s1, filter, perm_tbl);
- d2 = convolve4_8_usdot(s2, filter, perm_tbl);
- d3 = convolve4_8_usdot(s3, filter, perm_tbl);
+ uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
@@ -315,25 +170,24 @@ static INLINE void vpx_convolve_4tap_horiz_neon_i8mm(
}
}
-static INLINE void vpx_convolve_8tap_horiz_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
- uint8x16_t s0, s1, s2, s3;
-
+static INLINE void convolve_8tap_horiz_neon_i8mm(const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride, int w,
+ int h, const int8x8_t filter) {
if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve8_4_usdot(s0, filter, perm_tbl);
- t1 = convolve8_4_usdot(s1, filter, perm_tbl);
- t2 = convolve8_4_usdot(s2, filter, perm_tbl);
- t3 = convolve8_4_usdot(s3, filter, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+ int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl);
+ int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -343,23 +197,21 @@ static INLINE void vpx_convolve_8tap_horiz_neon_i8mm(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_usdot(s0, filter, perm_tbl);
- d1 = convolve8_8_usdot(s1, filter, perm_tbl);
- d2 = convolve8_8_usdot(s2, filter, perm_tbl);
- d3 = convolve8_8_usdot(s3, filter, perm_tbl);
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
@@ -379,8 +231,6 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
-
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
assert(x_step_q4 == 16);
@@ -390,18 +240,21 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
(void)y_step_q4;
if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
+ // Load 4-tap filter into first 4 elements of the vector.
+ // All 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
const int8x8_t x_filter_4tap =
- vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
- vpx_convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w,
- h, x_filter_4tap);
+ vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+
+ convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w, h,
+ x_filter_4tap);
} else {
- vpx_convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w,
- h, x_filter_8tap);
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+ convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w, h,
+ x_filter_8tap);
}
}
@@ -411,7 +264,6 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
- uint8x16_t s0, s1, s2, s3;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
@@ -424,22 +276,21 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
src -= 3;
if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23, dd01, dd23;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve8_4_usdot(s0, filters, perm_tbl);
- t1 = convolve8_4_usdot(s1, filters, perm_tbl);
- t2 = convolve8_4_usdot(s2, filters, perm_tbl);
- t3 = convolve8_4_usdot(s3, filters, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+ int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl);
+ int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl);
+ int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl);
+ int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
- dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
- dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
@@ -452,24 +303,23 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
h -= 4;
} while (h != 0);
} else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_usdot(s0, filters, perm_tbl);
- d1 = convolve8_8_usdot(s1, filters, perm_tbl);
- d2 = convolve8_8_usdot(s2, filters, perm_tbl);
- d3 = convolve8_8_usdot(s3, filters, perm_tbl);
+ uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl);
+ uint8x8_t dd0, dd1, dd2, dd3;
load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
d0 = vrhadd_u8(d0, dd0);
@@ -492,216 +342,130 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
uint8x8_t a2, uint8x8_t a3,
- uint8x16_t *b,
- const uint8x16_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, XX, XX, XX, XX
- * a1: 10, 11, 12, 13, XX, XX, XX, XX
- * a2: 20, 21, 22, 23, XX, XX, XX, XX
- * a3: 30, 31, 32, 33, XX, XX, XX, XX
- *
- * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
- *b = vqtbl2q_u8(samples, permute_tbl);
+ uint8x16_t *b) {
+ // Transpose 8-bit elements and concatenate result rows as follows:
+ // a0: 00, 01, 02, 03, XX, XX, XX, XX
+ // a1: 10, 11, 12, 13, XX, XX, XX, XX
+ // a2: 20, 21, 22, 23, XX, XX, XX, XX
+ // a3: 30, 31, 32, 33, XX, XX, XX, XX
+ //
+ // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+
+ uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
+ uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
+ uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
+ uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
+
+ uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
+ uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];
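+ // a01: 00, 10, 01, 11, 02, 12, 03, 13, ...
+ // a23: 20, 30, 21, 31, 22, 32, 23, 33, ...
+ // Zipping these again as 16-bit pairs interleaves the row pairs into the
+ // transposed layout shown above.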
+
+ uint16x8_t a0123 =
+ vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0];
+
+ *b = vreinterpretq_u8_u16(a0123);
}
static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
uint8x8_t a2, uint8x8_t a3,
- uint8x16_t *b0, uint8x16_t *b1,
- const uint8x16x2_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, 04, 05, 06, 07
- * a1: 10, 11, 12, 13, 14, 15, 16, 17
- * a2: 20, 21, 22, 23, 24, 25, 26, 27
- * a3: 30, 31, 32, 33, 34, 35, 36, 37
- *
- * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
- *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
- *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
+ uint8x16_t *b0, uint8x16_t *b1) {
+ // Transpose 8-bit elements and concatenate result rows as follows:
+ // a0: 00, 01, 02, 03, 04, 05, 06, 07
+ // a1: 10, 11, 12, 13, 14, 15, 16, 17
+ // a2: 20, 21, 22, 23, 24, 25, 26, 27
+ // a3: 30, 31, 32, 33, 34, 35, 36, 37
+ //
+ // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+
+ uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
+ uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
+ uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
+ uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
+
+ uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
+ uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];
+
+ uint16x8x2_t a0123 =
+ vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23));
+
+ *b0 = vreinterpretq_u8_u16(a0123.val[0]);
+ *b1 = vreinterpretq_u8_u16(a0123.val[1]);
}
-static INLINE void vpx_convolve_4tap_vert_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
- const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x16x2_t samples_LUT;
-
- if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- uint8x16_t s0123, s1234, s2345, s3456, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- src += 7 * src_stride;
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
-
- do {
- load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
-
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
-
- d0 = convolve4_4_usdot_partial(s0123, filter);
- d1 = convolve4_4_usdot_partial(s1234, filter);
- d2 = convolve4_4_usdot_partial(s2345, filter);
- d3 = convolve4_4_usdot_partial(s3456, filter);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s0123 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s1234 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s2345 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
- s3456 = s78910;
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h != 0);
- } else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
- do {
- height = h;
- s = src;
- d = dst;
+static INLINE int16x4_t convolve8_4_v(const uint8x16_t samples_lo,
+ const uint8x16_t samples_hi,
+ const int8x8_t filters) {
+ // Sample permutation is performed by the caller.
+ int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
+ sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
- load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
-
- do {
- load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
-
- d0 = convolve4_8_usdot_partial(s0123_lo, s0123_hi, filter);
- d1 = convolve4_8_usdot_partial(s1234_lo, s1234_hi, filter);
- d2 = convolve4_8_usdot_partial(s2345_lo, s2345_hi, filter);
- d3 = convolve4_8_usdot_partial(s3456_lo, s3456_hi, filter);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s0123_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s1234_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s2345_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
- s3456_lo = s78910_lo;
-
- samples_LUT.val[0] = s3456_hi;
- samples_LUT.val[1] = s78910_hi;
- s0123_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s1234_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s2345_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
- s3456_hi = s78910_hi;
+ // Further narrowing and packing is performed by the caller.
+ return vshrn_n_s32(sum, 1);
+}
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w != 0);
- }
+static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo,
+ const uint8x16_t samples0_hi,
+ const uint8x16_t samples1_lo,
+ const uint8x16_t samples1_hi,
+ const int8x8_t filters) {
+ // Sample permutation is performed by the caller.
+
+ // First 4 output values.
+ int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
+ sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
+ // Second 4 output values.
+ int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
+ sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
-static INLINE void vpx_convolve_8tap_vert_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+static INLINE void convolve_8tap_vert_neon_i8mm(const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride, int w,
+ int h, const int8x8_t filter) {
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x16x2_t samples_LUT;
-
if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6;
load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
src += 7 * src_stride;
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ uint8x16_t s0123, s1234, s2345, s3456;
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456);
do {
+ uint8x8_t s7, s8, s9, s10;
load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+ uint8x16_t s78910;
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ uint8x16x2_t samples_LUT = { { s3456, s78910 } };
+ uint8x16_t s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
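+ // The three lookups shift the transposed block left by one, two and three
+ // columns respectively, pulling the new columns from s78910. This avoids
+ // re-transposing the new rows from scratch on every iteration.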
- d0 = convolve8_4_usdot_partial(s0123, s4567, filter);
- d1 = convolve8_4_usdot_partial(s1234, s5678, filter);
- d2 = convolve8_4_usdot_partial(s2345, s6789, filter);
- d3 = convolve8_4_usdot_partial(s3456, s78910, filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
+ int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
+ int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
+ int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123 = s4567;
s1234 = s5678;
s2345 = s6789;
@@ -712,67 +476,56 @@ static INLINE void vpx_convolve_8tap_vert_neon_i8mm(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
- s6789_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
do {
- height = h;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6;
load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
s += 7 * src_stride;
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi;
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
+ uint8x8_t s7, s8, s9, s10;
load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
+ uint8x16_t s78910_lo, s78910_hi;
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+ uint8x16_t s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
samples_LUT.val[0] = s3456_hi;
samples_LUT.val[1] = s78910_hi;
- s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- filter);
- d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- filter);
- d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- filter);
- d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- filter);
+ uint8x16_t s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ uint8x8_t d0 =
+ convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
+ uint8x8_t d1 =
+ convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
+ uint8x8_t d2 =
+ convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
+ uint8x8_t d3 =
+ convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123_lo = s4567_lo;
s0123_hi = s4567_hi;
s1234_lo = s5678_lo;
@@ -798,8 +551,6 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
-
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
assert(y_step_q4 == 16);
@@ -809,17 +560,15 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
(void)y_step_q4;
if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
- const int8x8_t y_filter_4tap =
- vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2);
- vpx_convolve_4tap_vert_neon_i8mm(src - src_stride, src_stride, dst,
- dst_stride, w, h, y_filter_4tap);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
+ y_filter);
} else {
- vpx_convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst,
- dst_stride, w, h, y_filter_8tap);
+ const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+ convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst,
+ dst_stride, w, h, y_filter);
}
}
@@ -830,8 +579,6 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
int w, int h) {
const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x16x2_t samples_LUT;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
@@ -844,43 +591,40 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
src -= 3 * src_stride;
if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23, dd01, dd23;
-
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6;
load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
src += 7 * src_stride;
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ uint8x16_t s0123, s1234, s2345, s3456;
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456);
do {
+ uint8x8_t s7, s8, s9, s10;
load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+ uint8x16_t s78910;
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ uint8x16x2_t samples_LUT = { { s3456, s78910 } };
+ uint8x16_t s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_usdot_partial(s0123, s4567, filters);
- d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
- d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
- d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ int16x4_t d0 = convolve8_4_v(s0123, s4567, filters);
+ int16x4_t d1 = convolve8_4_v(s1234, s5678, filters);
+ int16x4_t d2 = convolve8_4_v(s2345, s6789, filters);
+ int16x4_t d3 = convolve8_4_v(s3456, s78910, filters);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
- dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
- dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
@@ -888,8 +632,8 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123 = s4567;
s1234 = s5678;
s2345 = s6789;
@@ -900,63 +644,53 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
- s6789_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
do {
- height = h;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6;
load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
s += 7 * src_stride;
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi;
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
+ uint8x8_t s7, s8, s9, s10;
load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
+ uint8x16_t s78910_lo, s78910_hi;
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+ uint8x16_t s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
samples_LUT.val[0] = s3456_hi;
samples_LUT.val[1] = s78910_hi;
- s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- filters);
- d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- filters);
- d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- filters);
- d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- filters);
-
+ uint8x16_t s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ uint8x8_t d0 =
+ convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters);
+ uint8x8_t d1 =
+ convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters);
+ uint8x8_t d2 =
+ convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters);
+ uint8x8_t d3 =
+ convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters);
+
+ uint8x8_t dd0, dd1, dd2, dd3;
load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
d0 = vrhadd_u8(d0, dd0);
@@ -987,3 +721,275 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
} while (w != 0);
}
}
+
+static INLINE void convolve_4tap_2d_neon_i8mm(const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride, int w,
+ int h, const int8x8_t x_filter,
+ const uint8x8_t y_filter) {
+ // Neon does not have lane-referencing multiply or multiply-accumulate
+ // instructions that operate on vectors of 8-bit elements. This means we have
+ // to duplicate filter taps into a whole vector and use standard multiply /
+ // multiply-accumulate instructions.
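+ // For a 4-tap filter the live taps sit at positions 2-5 of the padded
+ // 8-tap kernel, hence lanes 2-5 below.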
+ const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2),
+ vdup_lane_u8(y_filter, 3),
+ vdup_lane_u8(y_filter, 4),
+ vdup_lane_u8(y_filter, 5) };
+
+ if (w == 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+
+ uint8x16_t h_s0, h_s1, h_s2;
+ load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
+
+ int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl);
+ int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl);
+ int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl);
+ // We halved the filter values so -1 from right shift.
+ uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1);
+
+ src += 3 * src_stride;
+
+ do {
+ uint8x16_t h_s3, h_s4, h_s5, h_s6;
+ load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+ int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl);
+ int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl);
+ int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl);
+ int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl);
+ // We halved the filter values so -1 from right shift.
+ uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1);
+ uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1);
+ uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4);
+ uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4);
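+ // Each uint8x8_t above holds two consecutive 4-wide rows, so vext
+ // re-pairs them: v_s23 holds rows (2,3) and v_s45 rows (4,5) for the
+ // vertical pass.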
+
+ uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps);
+ uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps);
+
+ store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ v_s01 = v_s45;
+ v_s12 = v_s56;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+
+ uint8x16_t h_s0, h_s1, h_s2;
+ load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2);
+
+ uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl);
+ uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl);
+ uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl);
+
+ s += 3 * src_stride;
+
+ do {
+ uint8x16_t h_s3, h_s4, h_s5, h_s6;
+ load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+ uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl);
+ uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl);
+ uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl);
+ uint8x8_t v_s6 = convolve4_8_h(h_s6, x_filter, permute_tbl);
+
+ uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps);
+ uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps);
+ uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps);
+ uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE void convolve_8tap_2d_horiz_neon_i8mm(
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+ int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 3);
+
+ // Process final three rows (h % 4 == 3). See vpx_convolve8_neon_i8mm()
+ // below for further details on possible values of block height.
+ uint8x16_t s0, s1, s2;
+ load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+ int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 =
+ vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8_4x1(dst + 2 * dst_stride, d23);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 3);
+
+ // Process final three rows (h % 4 == 3). See vpx_convolve8_neon_i8mm()
+ // below for further details on possible values of block height.
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2;
+ load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+
+ store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ }
+}
+
+void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
+ const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+ // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
+ // lines post both horizontally and vertically.
+ const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
+ const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;
+
+ if (x_filter_taps == 4 && y_filter_taps == 4) {
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ // 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const int8x8_t x_filter_4tap =
+ vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+ const uint8x8_t y_filter_4tap =
+ vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1);
+
+ convolve_4tap_2d_neon_i8mm(src - horiz_offset - vert_offset, src_stride,
+ dst, dst_stride, w, h, x_filter_4tap,
+ y_filter_4tap);
+ return;
+ }
+
+ // Given our constraints: w <= 64, h <= 64, taps <= 8, we can reduce the
+ // maximum buffer size to 64 * (64 + 7).
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+ const int im_stride = 64;
+ const int im_height = h + SUBPEL_TAPS - 1;
+
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+ convolve_8tap_2d_horiz_neon_i8mm(src - horiz_offset - vert_offset, src_stride,
+ im_block, im_stride, w, im_height,
+ x_filter_8tap);
+
+ convolve_8tap_vert_neon_i8mm(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter_8tap);
+}
+
+void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+ const int im_stride = 64;
+
+ // Averaging convolution always uses an 8-tap filter.
+ // Account for the vertical phase needing 3 lines prior and 4 lines post.
+ const int im_height = h + SUBPEL_TAPS - 1;
+ const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1;
+
+ assert(y_step_q4 == 16);
+ assert(x_step_q4 == 16);
+
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+ convolve_8tap_2d_horiz_neon_i8mm(src - offset - offset * src_stride,
+ src_stride, im_block, im_stride, w,
+ im_height, x_filter_8tap);
+
+ vpx_convolve8_avg_vert_neon_i8mm(im_block + offset * im_stride, im_stride,
+ dst, dst_stride, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
index 57772ea668..de5fa29471 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
@@ -19,31 +19,32 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const InterpKernel *filter,
int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
- * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
- */
- uint8_t temp[64 * 72];
+ // Given our constraints: w <= 64, h <= 64, taps <= 8, we can reduce the
+ // maximum buffer size to 64 * (64 + 7) (+1 row to make it divisible by 4).
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 72]);
+ const int im_stride = 64;
const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
- /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
- * and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.) */
- const int intermediate_height = h + vert_filter_taps;
+ // Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
+ // and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.)
+ const int im_height = h + vert_filter_taps;
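+ // e.g. an 8-tap filter gives im_height = h + 8 and border_offset = 3;
+ // a 4-tap filter gives im_height = h + 4 and border_offset = 1.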
const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
assert(y_step_q4 == 16);
assert(x_step_q4 == 16);
- /* Filter starting border_offset lines back. The Neon implementation will
- * ignore the given height and filter a multiple of 4 lines. Since this goes
- * in to the temp buffer which has lots of extra room and is subsequently
- * discarded this is safe if somewhat less than ideal. */
- vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride, temp,
- w, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- intermediate_height);
+ // Filter starting border_offset rows back. The Neon implementation will
+ // ignore the given height and filter a multiple of 4 lines. Since this goes
+ // into the temporary buffer which has lots of extra room and is subsequently
+ // discarded this is safe if somewhat less than ideal.
+ vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride,
+ im_block, im_stride, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, im_height);
- /* Step into the temp buffer border_offset lines to get actual frame data. */
- vpx_convolve8_vert_neon(temp + w * border_offset, w, dst, dst_stride, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+ // Step into the temporary buffer border_offset rows to get actual frame data.
+ vpx_convolve8_vert_neon(im_block + im_stride * border_offset, im_stride, dst,
+ dst_stride, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, h);
}
void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
@@ -51,18 +52,21 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
- uint8_t temp[64 * 72];
- const int intermediate_height = h + 8;
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 72]);
+ const int im_stride = 64;
+ const int im_height = h + SUBPEL_TAPS;
+ const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1;
assert(y_step_q4 == 16);
assert(x_step_q4 == 16);
- /* This implementation has the same issues as above. In addition, we only want
- * to average the values after both passes.
- */
- vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- intermediate_height);
- vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
- x_step_q4, y0_q4, y_step_q4, w, h);
+ // This implementation has the same issues as above. In addition, we only want
+ // to average the values after both passes.
+ vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride,
+ im_block, im_stride, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, im_height);
+
+ vpx_convolve8_avg_vert_neon(im_block + im_stride * border_offset, im_stride,
+ dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, h);
}
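As a worked instance of the comments above (illustrative, not from the patch): for h = 64 with an 8-tap vertical filter, im_height = 64 + 8 = 72 rows (taps - 1 rows of filter context plus one padding row), so the 64 * 72 byte im_block covers the worst case.

/* Illustrative helper mirroring the sizing rule used above. */
static int im_rows(int h, int taps) {
  /* taps/2 - 1 rows before + taps/2 rows after, +1 to keep the total a
   * multiple of 4: im_rows(64, 8) == 72, matching im_block[64 * 72]. */
  return h + taps;
}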
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c
deleted file mode 100644
index 9d754fde17..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/arm/vpx_convolve8_neon.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/vpx_filter.h"
-#include "vpx_ports/mem.h"
-
-void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4, int w,
- int h) {
- /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
- * maximum buffer size to 64 * (64 + 7). */
- uint8_t temp[64 * 71];
-
- const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
- /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
- * and vert_filter_taps / 2 lines post. */
- const int intermediate_height = h + vert_filter_taps - 1;
- const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
-
- assert(y_step_q4 == 16);
- assert(x_step_q4 == 16);
-
- vpx_convolve8_2d_horiz_neon_dotprod(
- src - src_stride * border_offset, src_stride, temp, w, filter, x0_q4,
- x_step_q4, y0_q4, y_step_q4, w, intermediate_height);
-
- vpx_convolve8_vert_neon_dotprod(temp + w * border_offset, w, dst, dst_stride,
- filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- h);
-}
-
-void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4,
- int w, int h) {
- uint8_t temp[64 * 71];
-
- /* Averaging convolution always uses an 8-tap filter. */
- /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
- const int intermediate_height = h + 7;
-
- assert(y_step_q4 == 16);
- assert(x_step_q4 == 16);
-
- vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w,
- filter, x0_q4, x_step_q4, y0_q4,
- y_step_q4, w, intermediate_height);
-
- vpx_convolve8_avg_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
-}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c
deleted file mode 100644
index d7cbb09ea6..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/arm/vpx_convolve8_neon.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/vpx_filter.h"
-#include "vpx_ports/mem.h"
-
-void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4, int w,
- int h) {
- /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
- * maximum buffer size to 64 * (64 + 7). */
- uint8_t temp[64 * 71];
-
- const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
- /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
- * and vert_filter_taps / 2 lines post. */
- const int intermediate_height = h + vert_filter_taps - 1;
- const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
-
- assert(y_step_q4 == 16);
- assert(x_step_q4 == 16);
-
- vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * border_offset, src_stride,
- temp, w, filter, x0_q4, x_step_q4, y0_q4,
- y_step_q4, w, intermediate_height);
-
- vpx_convolve8_vert_neon_i8mm(temp + w * border_offset, w, dst, dst_stride,
- filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- h);
-}
-
-void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4, int w,
- int h) {
- uint8_t temp[64 * 71];
-
- /* Averaging convolution always uses an 8-tap filter. */
- /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
- const int intermediate_height = h + 7;
-
- assert(y_step_q4 == 16);
- assert(x_step_q4 == 16);
-
- vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w,
- filter, x0_q4, x_step_q4, y0_q4, y_step_q4,
- w, intermediate_height);
-
- vpx_convolve8_avg_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
-}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h
new file mode 100644
index 0000000000..bf9f18c7e6
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_
+#define VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+
+// Certain instructions, such as the two-register table lookup (TBL2) used
+// below, are exclusive to the SVE2 instruction set.
+// However, we can access these instructions from a predominantly Neon context
+// by making use of the Neon-SVE bridge intrinsics to reinterpret Neon vectors
+// as SVE vectors - with the high part of the SVE vector (if it's longer than
+// 128 bits) being "don't care".
+
+static INLINE int16x8_t vpx_tbl2_s16(int16x8_t s0, int16x8_t s1,
+ uint16x8_t tbl) {
+ svint16x2_t samples = svcreate2_s16(svset_neonq_s16(svundef_s16(), s0),
+ svset_neonq_s16(svundef_s16(), s1));
+ return svget_neonq_s16(
+ svtbl2_s16(samples, svset_neonq_u16(svundef_u16(), tbl)));
+}
+
+#endif // VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_
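A hypothetical use of the helper (the function name and index pattern below are illustrative, not from the patch): TBL2 gathers arbitrary 16-bit elements from the concatenation of two Neon vectors, with indices 0-7 selecting from s0 and 8-15 from s1.

/* Sketch: reverse the elements of s1 using a two-vector table lookup. */
static int16x8_t reverse_second_vector(int16x8_t s0, int16x8_t s1) {
  const uint16_t idx[8] = { 15, 14, 13, 12, 11, 10, 9, 8 };
  return vpx_tbl2_s16(s0, s1, vld1q_u16(idx));
}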
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h
new file mode 100644
index 0000000000..48534fb70e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_
+#define VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+
+// Dot product instructions operating on 16-bit input elements are exclusive to
+// the SVE instruction set. However, we can access these instructions from a
+// predominantly Neon context by making use of the Neon-SVE bridge intrinsics
+// to reinterpret Neon vectors as SVE vectors - with the high part of the SVE
+// vector (if it's longer than 128 bits) being "don't care".
+
+// While sub-optimal on machines with an SVE vector length greater than 128
+// bits (the remainder of the vector is unused), this approach is still
+// beneficial compared to a Neon-only solution.
+
+static INLINE uint64x2_t vpx_dotq_u16(uint64x2_t acc, uint16x8_t x,
+ uint16x8_t y) {
+ return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc),
+ svset_neonq_u16(svundef_u16(), x),
+ svset_neonq_u16(svundef_u16(), y)));
+}
+
+static INLINE int64x2_t vpx_dotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) {
+ return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc),
+ svset_neonq_s16(svundef_s16(), x),
+ svset_neonq_s16(svundef_s16(), y)));
+}
+
+#define vpx_dotq_lane_s16(acc, x, y, lane) \
+ svget_neonq_s64(svdot_lane_s64(svset_neonq_s64(svundef_s64(), acc), \
+ svset_neonq_s16(svundef_s16(), x), \
+ svset_neonq_s16(svundef_s16(), y), lane))
+
+static INLINE uint16x8_t vpx_tbl_u16(uint16x8_t data, uint16x8_t indices) {
+ return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), data),
+ svset_neonq_u16(svundef_u16(), indices)));
+}
+
+#endif // VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_
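A hypothetical caller (not from the patch) shows the intended pattern: widening-free accumulation of 16-bit products into 64-bit lanes, with a single horizontal add at the end.

/* Sketch: sum of products of two int16 arrays; assumes n % 8 == 0. */
static int64_t dot_product_s16(const int16_t *a, const int16_t *b, int n) {
  int64x2_t acc = vdupq_n_s64(0);
  for (int i = 0; i < n; i += 8) {
    acc = vpx_dotq_s16(acc, vld1q_s16(a + i), vld1q_s16(b + i));
  }
  return vaddvq_s64(acc);
}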
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
index b8e3c5e540..9bd5ec285c 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
@@ -20,263 +20,271 @@
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_ports/mem.h"
-static INLINE void scaledconvolve_horiz_w4(
+static INLINE void scaledconvolve_horiz_neon(
const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
- const int x0_q4, const int x_step_q4, const int w, const int h) {
- DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
- int x, y, z;
+ const ptrdiff_t dst_stride, const InterpKernel *const x_filter,
+ const int x0_q4, const int x_step_q4, int w, int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
src -= SUBPEL_TAPS / 2 - 1;
- y = h;
- do {
- int x_q4 = x0_q4;
- x = 0;
+ if (w == 4) {
do {
- // process 4 src_x steps
- for (z = 0; z < 4; ++z) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ int x_q4 = x0_q4;
+
+ // Process a 4x4 tile.
+ for (int r = 0; r < 4; ++r) {
+ const uint8_t *s = &src[x_q4 >> SUBPEL_BITS];
+
if (x_q4 & SUBPEL_MASK) {
- const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
- uint8x8_t s[8], d;
- int16x8_t ss[4];
- int16x4_t t[8], tt;
-
- load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
- transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
-
- ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
- ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
- ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
- ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
- t[0] = vget_low_s16(ss[0]);
- t[1] = vget_low_s16(ss[1]);
- t[2] = vget_low_s16(ss[2]);
- t[3] = vget_low_s16(ss[3]);
- t[4] = vget_high_s16(ss[0]);
- t[5] = vget_high_s16(ss[1]);
- t[6] = vget_high_s16(ss[2]);
- t[7] = vget_high_s16(ss[3]);
-
- tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
- filters);
- d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
- vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
+ const int16x8_t filter = vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]);
+
+ uint8x8_t t0, t1, t2, t3;
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ uint8x8_t d0 =
+ vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS);
+
+ store_u8_4x1(&temp[4 * r], d0);
} else {
- int i;
- for (i = 0; i < 4; ++i) {
- temp[z * 4 + i] = src_x[i * src_stride + 3];
+        // Strided copy for non-subpel locations.
+ s += SUBPEL_TAPS / 2 - 1;
+
+ for (int c = 0; c < 4; ++c) {
+ temp[r * 4 + c] = s[c * src_stride];
}
}
x_q4 += x_step_q4;
}
- // transpose the 4x4 filters values back to dst
- {
- const uint8x8x4_t d4 = vld4_u8(temp);
- vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
- vreinterpret_u32_u8(d4.val[0]), 0);
- vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
- vreinterpret_u32_u8(d4.val[1]), 0);
- vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
- vreinterpret_u32_u8(d4.val[2]), 0);
- vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
- vreinterpret_u32_u8(d4.val[3]), 0);
- }
- x += 4;
- } while (x < w);
+ // Transpose the 4x4 result tile and store.
+ uint8x8_t d01 = vld1_u8(temp + 0);
+ uint8x8_t d23 = vld1_u8(temp + 8);
- src += src_stride * 4;
- dst += dst_stride * 4;
- y -= 4;
- } while (y > 0);
-}
+ transpose_u8_4x4(&d01, &d23);
-static INLINE void scaledconvolve_horiz_w8(
- const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
- const int x0_q4, const int x_step_q4, const int w, const int h) {
- DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
- int x, y, z;
- src -= SUBPEL_TAPS / 2 - 1;
+ store_u8_4x1(dst + 0 * dst_stride, d01);
+ store_u8_4x1(dst + 1 * dst_stride, d23);
+ store_u8_4x1_high(dst + 2 * dst_stride, d01);
+ store_u8_4x1_high(dst + 3 * dst_stride, d23);
- // This function processes 8x8 areas. The intermediate height is not always
- // a multiple of 8, so force it to be a multiple of 8 here.
- y = (h + 7) & ~7;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ return;
+ }
do {
int x_q4 = x0_q4;
- x = 0;
+ uint8_t *d = dst;
+ int width = w;
+
do {
- uint8x8_t d[8];
- // process 8 src_x steps
- for (z = 0; z < 8; ++z) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ // Process an 8x8 tile.
+ for (int r = 0; r < 8; ++r) {
+ const uint8_t *s = &src[x_q4 >> SUBPEL_BITS];
if (x_q4 & SUBPEL_MASK) {
- const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
- uint8x8_t s[8];
- load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
- &s[5], &s[6], &s[7]);
- transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
- &s[7]);
- d[0] = scale_filter_8(s, filters);
- vst1_u8(&temp[8 * z], d[0]);
+ const int16x8_t filter = vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]);
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+
+ vst1_u8(&temp[r * 8], d0);
} else {
- int i;
- for (i = 0; i < 8; ++i) {
- temp[z * 8 + i] = src_x[i * src_stride + 3];
+        // Strided copy for non-subpel locations.
+ s += SUBPEL_TAPS / 2 - 1;
+
+ for (int c = 0; c < 8; ++c) {
+ temp[r * 8 + c] = s[c * src_stride];
}
}
x_q4 += x_step_q4;
}
- // transpose the 8x8 filters values back to dst
- load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
- &d[7]);
- transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
- vst1_u8(&dst[x + 0 * dst_stride], d[0]);
- vst1_u8(&dst[x + 1 * dst_stride], d[1]);
- vst1_u8(&dst[x + 2 * dst_stride], d[2]);
- vst1_u8(&dst[x + 3 * dst_stride], d[3]);
- vst1_u8(&dst[x + 4 * dst_stride], d[4]);
- vst1_u8(&dst[x + 5 * dst_stride], d[5]);
- vst1_u8(&dst[x + 6 * dst_stride], d[6]);
- vst1_u8(&dst[x + 7 * dst_stride], d[7]);
- x += 8;
- } while (x < w);
-
- src += src_stride * 8;
- dst += dst_stride * 8;
- } while (y -= 8);
-}
+ // Transpose the 8x8 result tile and store.
+ uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
+ load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-static INLINE void scaledconvolve_vert_w4(
- const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
- const int y0_q4, const int y_step_q4, const int w, const int h) {
- int y;
- int y_q4 = y0_q4;
+ transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- y = h;
- do {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
- if (y_q4 & SUBPEL_MASK) {
- const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- uint8x8_t s[8], d;
- int16x4_t t[8], tt;
-
- load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
- &s[6], &s[7]);
- t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
- t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
- t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
- t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
- t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
- t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
- t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
- t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
-
- tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
- d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
- } else {
- memcpy(dst, &src_y[3 * src_stride], w);
- }
+ d += 8;
+ width -= 8;
+ } while (width != 0);
- dst += dst_stride;
- y_q4 += y_step_q4;
- } while (--y);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
}
-static INLINE void scaledconvolve_vert_w8(
+static INLINE void scaledconvolve_vert_neon(
const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
- const int y0_q4, const int y_step_q4, const int w, const int h) {
- int y;
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filter,
+ const int y0_q4, const int y_step_q4, int w, int h) {
int y_q4 = y0_q4;
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- y = h;
- do {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- if (y_q4 & SUBPEL_MASK) {
- const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- uint8x8_t s[8], d;
- load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
- &s[6], &s[7]);
- d = scale_filter_8(s, filters);
- vst1_u8(dst, d);
- } else {
- memcpy(dst, &src_y[3 * src_stride], w);
- }
- dst += dst_stride;
- y_q4 += y_step_q4;
- } while (--y);
-}
+ if (w == 4) {
+ do {
+ const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-static INLINE void scaledconvolve_vert_w16(
- const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
- const int y0_q4, const int y_step_q4, const int w, const int h) {
- int x, y;
- int y_q4 = y0_q4;
+ if (y_q4 & SUBPEL_MASK) {
+ const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]);
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
+ int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
+ int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+
+ int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ uint8x8_t d0 =
+ vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS);
+
+ store_u8_4x1(dst, d0);
+ } else {
+ // Memcpy for non-subpel locations.
+ memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 4);
+ }
+
+ y_q4 += y_step_q4;
+ dst += dst_stride;
+ } while (--h != 0);
+ return;
+ }
+
+ if (w == 8) {
+ do {
+ const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+
+ if (y_q4 & SUBPEL_MASK) {
+ const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]);
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+
+ vst1_u8(dst, d0);
+ } else {
+ // Memcpy for non-subpel locations.
+ memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 8);
+ }
+
+ y_q4 += y_step_q4;
+ dst += dst_stride;
+ } while (--h != 0);
+ return;
+ }
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- y = h;
do {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ uint8_t *d = dst;
+ int width = w;
+
if (y_q4 & SUBPEL_MASK) {
- x = 0;
do {
- const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- uint8x16_t ss[8];
- uint8x8_t s[8], d[2];
- load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
- &ss[5], &ss[6], &ss[7]);
- s[0] = vget_low_u8(ss[0]);
- s[1] = vget_low_u8(ss[1]);
- s[2] = vget_low_u8(ss[2]);
- s[3] = vget_low_u8(ss[3]);
- s[4] = vget_low_u8(ss[4]);
- s[5] = vget_low_u8(ss[5]);
- s[6] = vget_low_u8(ss[6]);
- s[7] = vget_low_u8(ss[7]);
- d[0] = scale_filter_8(s, filters);
-
- s[0] = vget_high_u8(ss[0]);
- s[1] = vget_high_u8(ss[1]);
- s[2] = vget_high_u8(ss[2]);
- s[3] = vget_high_u8(ss[3]);
- s[4] = vget_high_u8(ss[4]);
- s[5] = vget_high_u8(ss[5]);
- s[6] = vget_high_u8(ss[6]);
- s[7] = vget_high_u8(ss[7]);
- d[1] = scale_filter_8(s, filters);
- vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
- src_y += 16;
- x += 16;
- } while (x < w);
+ const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]);
+
+ uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_16x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+ s0[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
+ s1[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1)));
+ s2[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2)));
+ s3[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3)));
+ s4[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t4)));
+ s5[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t5)));
+ s6[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t6)));
+ s7[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t7)));
+
+ s0[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
+ s1[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1)));
+ s2[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2)));
+ s3[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3)));
+ s4[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t4)));
+ s5[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t5)));
+ s6[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t6)));
+ s7[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t7)));
+
+ uint8x8_t d0 = convolve8_8(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0],
+ s6[0], s7[0], filter);
+ uint8x8_t d1 = convolve8_8(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1],
+ s6[1], s7[1], filter);
+
+ vst1q_u8(d, vcombine_u8(d0, d1));
+
+ s += 16;
+ d += 16;
+ width -= 16;
+ } while (width != 0);
} else {
- memcpy(dst, &src_y[3 * src_stride], w);
+ // Memcpy for non-subpel locations.
+ s += (SUBPEL_TAPS / 2 - 1) * src_stride;
+
+ do {
+ uint8x16_t s0 = vld1q_u8(s);
+ vst1q_u8(d, s0);
+ s += 16;
+ d += 16;
+ width -= 16;
+ } while (width != 0);
}
- dst += dst_stride;
+
y_q4 += y_step_q4;
- } while (--y);
+ dst += dst_stride;
+ } while (--h != 0);
}
void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const InterpKernel *filter,
int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // Fixed size intermediate buffer, im_block, places limits on parameters.
// 2d filtering proceeds in 2 steps:
 // (1) Interpolate horizontally into an intermediate buffer, im_block.
 // (2) Interpolate im_block vertically to derive the sub-pixel result.
- // Deriving the maximum number of rows in the temp buffer (135):
+ // Deriving the maximum number of rows in the im_block buffer (135):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 64x64 pixels.
// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
@@ -288,33 +296,20 @@ void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
 // When called from the frame scaling function, the smallest scaling factor
 // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16, the im_block
 // buffer is still big enough.
- DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
- const int intermediate_height =
+ DECLARE_ALIGNED(16, uint8_t, im_block[(135 + 8) * 64]);
+ const int im_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+ const ptrdiff_t im_stride = 64;
assert(w <= 64);
assert(h <= 64);
assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
assert(x_step_q4 <= 64);
- if (w >= 8) {
- scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
- intermediate_height);
- } else {
- scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
- intermediate_height);
- }
+ scaledconvolve_horiz_neon(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, im_block, im_stride, filter, x0_q4,
+ x_step_q4, w, im_height);
- if (w >= 16) {
- scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, filter, y0_q4, y_step_q4, w, h);
- } else if (w == 8) {
- scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, filter, y0_q4, y_step_q4, w, h);
- } else {
- scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, filter, y0_q4, y_step_q4, w, h);
- }
+ scaledconvolve_vert_neon(im_block, im_stride, dst, dst_stride, filter, y0_q4,
+ y_step_q4, w, h);
}
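Working the bound in the comment above (illustrative arithmetic, not new code in the patch): the worst legal case is h = 64, y_step_q4 = 32, with the largest sub-pel phase y0_q4 = 15, giving ((64 - 1) * 32 + 15) >> 4 = 126 spanned source rows, plus SUBPEL_TAPS = 8 rows of filter context, i.e. im_height = 134, comfortably inside the (135 + 8)-row buffer.

/* Sketch of the im_height formula with the worst-case values plugged in. */
static int scaled_im_height(int h, int y_step_q4, int y0_q4) {
  return (((h - 1) * y_step_q4 + y0_q4) >> 4 /* SUBPEL_BITS */) + 8;
}
/* scaled_im_height(64, 32, 15) == 134 */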
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
index 2bee91f449..916dc62cef 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
@@ -112,7 +112,8 @@ DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c
-DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_neon.c
+DSP_SRCS-$(HAVE_SVE) += arm/highbd_vpx_convolve8_sve.c
+DSP_SRCS-$(HAVE_SVE2) += arm/highbd_vpx_convolve8_sve2.c
endif
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm
@@ -139,9 +140,7 @@ DSP_SRCS-yes += arm/vpx_convolve8_neon.c
DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c
DSP_SRCS-yes += arm/vpx_convolve_neon.c
DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve8_neon_dotprod.c
-DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve_neon_dotprod.c
DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve8_neon_i8mm.c
-DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve_neon_i8mm.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM
@@ -374,6 +373,7 @@ DSP_SRCS-yes += sad.c
DSP_SRCS-yes += subtract.c
DSP_SRCS-yes += sum_squares.c
DSP_SRCS-$(HAVE_NEON) += arm/sum_squares_neon.c
+DSP_SRCS-$(HAVE_SVE) += arm/sum_squares_sve.c
DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c
DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c
@@ -454,6 +454,8 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_sse_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/highbd_variance_neon_dotprod.c
+DSP_SRCS-$(HAVE_SVE) += arm/highbd_variance_sve.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
index 030c456d39..2b8c656afb 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
@@ -12,4 +12,4 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/vpx_once.h"
-void vpx_dsp_rtcd() { once(setup_rtcd_internal); }
+void vpx_dsp_rtcd(void) { once(setup_rtcd_internal); }
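The () to (void) changes in this and the following rtcd files are not cosmetic: before C23, an empty parameter list in C declares a function taking an unspecified number of arguments, so only (void) yields a true prototype (newer compilers warn about the old form, e.g. Clang's -Wdeprecated-non-prototype). A contrived illustration:

/* Illustration only. */
void old_style();      /* unspecified parameters: old_style(1, 2) compiles */
void prototyped(void); /* true prototype: prototyped(1) fails to compile */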
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 18087e25d9..f40f85c036 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -427,19 +427,19 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
- specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_horiz avx2 neon sve/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
- specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_vert avx2 neon sve2/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
- specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon sve/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
- specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon sve2/, "$sse2_x86_64";
} # CONFIG_VP9_HIGHBITDEPTH
if (vpx_config("CONFIG_VP9") eq "yes") {
@@ -1009,7 +1009,7 @@ add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride,
specialize qw/vpx_sad_skip_4x4x4d neon/;
add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
-specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/;
+specialize qw/vpx_sum_squares_2d_i16 neon sve sse2 msa/;
#
# Structured Similarity (SSIM)
@@ -1411,163 +1411,163 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, i
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance64x64 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance64x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance64x32 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance64x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x64 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance32x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x32 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance32x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x16 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance32x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x32 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance16x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x16 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance16x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x8 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance16x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance8x16 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance8x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance8x8 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance8x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance8x4 neon/;
+ specialize qw/vpx_highbd_12_variance8x4 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance4x8 neon/;
+ specialize qw/vpx_highbd_12_variance4x8 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance4x4 neon/;
+ specialize qw/vpx_highbd_12_variance4x4 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance64x64 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance64x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance64x32 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance64x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x64 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance32x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x32 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance32x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x16 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance32x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x32 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance16x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x16 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance16x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x8 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance16x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance8x16 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance8x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance8x8 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance8x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance8x4 neon/;
+ specialize qw/vpx_highbd_10_variance8x4 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance4x8 neon/;
+ specialize qw/vpx_highbd_10_variance4x8 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance4x4 neon/;
+ specialize qw/vpx_highbd_10_variance4x4 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance64x64 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance64x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance64x32 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance64x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x64 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance32x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x32 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance32x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x16 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance32x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x32 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance16x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x16 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance16x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x8 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance16x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance8x16 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance8x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance8x8 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance8x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance8x4 neon/;
+ specialize qw/vpx_highbd_8_variance8x4 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance4x8 neon/;
+ specialize qw/vpx_highbd_8_variance4x8 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance4x4 neon/;
+ specialize qw/vpx_highbd_8_variance4x4 neon sve/;
add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_8_get16x16var sse2 neon/;
+ specialize qw/vpx_highbd_8_get16x16var sse2 neon sve/;
add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_8_get8x8var sse2 neon/;
+ specialize qw/vpx_highbd_8_get8x8var sse2 neon sve/;
add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_10_get16x16var sse2 neon/;
+ specialize qw/vpx_highbd_10_get16x16var sse2 neon sve/;
add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_10_get8x8var sse2 neon/;
+ specialize qw/vpx_highbd_10_get8x8var sse2 neon sve/;
add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_12_get16x16var sse2 neon/;
+ specialize qw/vpx_highbd_12_get16x16var sse2 neon sve/;
add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_12_get8x8var sse2 neon/;
+ specialize qw/vpx_highbd_12_get8x8var sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse16x16 sse2 neon/;
+ specialize qw/vpx_highbd_8_mse16x16 sse2 neon neon_dotprod/;
add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse16x8 neon/;
+ specialize qw/vpx_highbd_8_mse16x8 neon neon_dotprod/;
add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse8x16 neon/;
+ specialize qw/vpx_highbd_8_mse8x16 neon neon_dotprod/;
add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse8x8 sse2 neon/;
+ specialize qw/vpx_highbd_8_mse8x8 sse2 neon neon_dotprod/;
add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse16x16 sse2 neon/;
+ specialize qw/vpx_highbd_10_mse16x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse16x8 neon/;
+ specialize qw/vpx_highbd_10_mse16x8 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse8x16 neon/;
+ specialize qw/vpx_highbd_10_mse8x16 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse8x8 sse2 neon/;
+ specialize qw/vpx_highbd_10_mse8x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse16x16 sse2 neon/;
+ specialize qw/vpx_highbd_12_mse16x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse16x8 neon/;
+ specialize qw/vpx_highbd_12_mse16x8 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse8x16 neon/;
+ specialize qw/vpx_highbd_12_mse8x16 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse8x8 sse2 neon/;
+ specialize qw/vpx_highbd_12_mse8x8 sse2 neon sve/;
add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride";
specialize qw/vpx_highbd_comp_avg_pred neon sse2/;
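Each specialize line above teaches the RTCD generator to emit a function pointer that setup_rtcd_internal() retargets at init time. A sketch of the generated shape for one of the new entries (the _c/_neon/_sve suffixes follow libvpx's convention; the exact emitted code may differ):

/* Sketch of generated runtime dispatch (illustrative). */
uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size);

static void setup_sum_squares_sketch(int flags) {
  vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_c;
  if (flags & HAS_NEON) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_neon;
  if (flags & HAS_SVE) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sve;
}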
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_filter.h b/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
index 0cddcb6991..eb8ff06cd7 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
@@ -28,7 +28,6 @@ extern "C" {
typedef int16_t InterpKernel[SUBPEL_TAPS];
static INLINE int vpx_get_filter_taps(const int16_t *const filter) {
- assert(filter[3] != 128);
if (filter[0] | filter[7]) {
return 8;
}
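The dropped assert previously rejected kernels with filter[3] == 128, i.e. the identity kernel used at integer-pel positions; with it gone, callers may classify that kernel too. An illustrative value (not from the patch):

/* The identity kernel: all weight on the centre tap. */
static const int16_t kCopyKernel[8] = { 0, 0, 0, 128, 0, 0, 0, 0 };
/* filter[0] | filter[7] == 0, so vpx_get_filter_taps(kCopyKernel) falls
 * through to the shorter-tap cases instead of tripping an assert. */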
diff --git a/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c
index 539d09bb39..eba12d312a 100644
--- a/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c
+++ b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c
@@ -15,7 +15,7 @@
#include <sys/sysctl.h>
#endif
-#if !CONFIG_RUNTIME_CPU_DETECT
+#if !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__)
static int arm_get_cpu_caps(void) {
// This function should actually be a no-op. There is no way to adjust any of
@@ -28,7 +28,7 @@ static int arm_get_cpu_caps(void) {
return flags;
}
-#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT
+#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__)
// sysctlbyname() parameter documentation for instruction set characteristics:
// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics
@@ -99,14 +99,17 @@ static int arm_get_cpu_caps(void) {
// hwcap values are not defined should not prevent features from being enabled.
#define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20)
#define VPX_AARCH64_HWCAP_SVE (1 << 22)
+#define VPX_AARCH64_HWCAP2_SVE2 (1 << 1)
#define VPX_AARCH64_HWCAP2_I8MM (1 << 13)
static int arm_get_cpu_caps(void) {
int flags = 0;
+#if HAVE_NEON_DOTPROD || HAVE_SVE
unsigned long hwcap = getauxval(AT_HWCAP);
-#if HAVE_NEON_I8MM
+#endif // HAVE_NEON_DOTPROD || HAVE_SVE
+#if HAVE_NEON_I8MM || HAVE_SVE2
unsigned long hwcap2 = getauxval(AT_HWCAP2);
-#endif // HAVE_NEON_I8MM
+#endif // HAVE_NEON_I8MM || HAVE_SVE2
#if HAVE_NEON
flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A.
#endif // HAVE_NEON
@@ -125,6 +128,11 @@ static int arm_get_cpu_caps(void) {
flags |= HAS_SVE;
}
#endif // HAVE_SVE
+#if HAVE_SVE2
+ if (hwcap2 & VPX_AARCH64_HWCAP2_SVE2) {
+ flags |= HAS_SVE2;
+ }
+#endif // HAVE_SVE2
return flags;
}
@@ -195,5 +203,10 @@ int arm_cpu_caps(void) {
flags &= ~HAS_SVE;
}
+ // Restrict flags: FEAT_SVE2 assumes that FEAT_SVE is available.
+ if (!(flags & HAS_SVE)) {
+ flags &= ~HAS_SVE2;
+ }
+
return flags;
}
diff --git a/media/libvpx/libvpx/vpx_ports/arm.h b/media/libvpx/libvpx/vpx_ports/arm.h
index 39365d18ee..814c3cc408 100644
--- a/media/libvpx/libvpx/vpx_ports/arm.h
+++ b/media/libvpx/libvpx/vpx_ports/arm.h
@@ -25,6 +25,8 @@ extern "C" {
#define HAS_NEON_I8MM (1 << 2)
// Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A.
#define HAS_SVE (1 << 3)
+// Armv9.0-A SVE2 instructions.
+#define HAS_SVE2 (1 << 4)
int arm_cpu_caps(void);
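Callers gate kernel selection on these bits at run time; a hypothetical check (not from the patch):

/* HAS_SVE2 is only reported when HAS_SVE is also set (see cpudetect). */
static int use_sve2_paths(void) {
  return (arm_cpu_caps() & HAS_SVE2) != 0;
}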
diff --git a/media/libvpx/libvpx/vpx_ports/emms_mmx.c b/media/libvpx/libvpx/vpx_ports/emms_mmx.c
index f1036b98ed..79b98a75f1 100644
--- a/media/libvpx/libvpx/vpx_ports/emms_mmx.c
+++ b/media/libvpx/libvpx/vpx_ports/emms_mmx.c
@@ -12,4 +12,4 @@
#include "vpx_ports/system_state.h"
-void vpx_clear_system_state() { _mm_empty(); }
+void vpx_clear_system_state(void) { _mm_empty(); }
diff --git a/media/libvpx/libvpx/vpx_ports/mem.h b/media/libvpx/libvpx/vpx_ports/mem.h
index 5eccfe8f50..ee9e095633 100644
--- a/media/libvpx/libvpx/vpx_ports/mem.h
+++ b/media/libvpx/libvpx/vpx_ports/mem.h
@@ -23,7 +23,13 @@
#define DECLARE_ALIGNED(n, typ, val) typ val
#endif
-#if HAVE_NEON && defined(_MSC_VER)
+#if defined(__has_builtin)
+#define VPX_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define VPX_HAS_BUILTIN(x) 0
+#endif
+
+#if !VPX_HAS_BUILTIN(__builtin_prefetch) && !defined(__GNUC__)
#define __builtin_prefetch(x)
#endif
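The new guard swaps an MSVC-specific test for feature detection: any compiler without __builtin_prefetch (detected via __has_builtin, with GCC assumed to provide it) gets a no-op macro, so call sites stay unconditional. A hypothetical call site:

/* Sketch: prefetch the next rows of a frame; a no-op where unsupported. */
static void prefetch_rows(const unsigned char *src, int stride, int rows) {
  int r;
  for (r = 0; r < rows; ++r) {
    __builtin_prefetch(src + r * stride);
  }
}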
diff --git a/media/libvpx/libvpx/vpx_ports/vpx_once.h b/media/libvpx/libvpx/vpx_ports/vpx_once.h
index d8a8ed89fe..d33eff4397 100644
--- a/media/libvpx/libvpx/vpx_ports/vpx_once.h
+++ b/media/libvpx/libvpx/vpx_ports/vpx_once.h
@@ -91,29 +91,6 @@ static void once(void (*func)(void)) {
return;
}
-#elif CONFIG_MULTITHREAD && defined(__OS2__)
-#define INCL_DOS
-#include <os2.h>
-static void once(void (*func)(void)) {
- static volatile int done;
-
- /* If the initialization is complete, return early. */
- if (done) return;
-
- /* Causes all other threads in the process to block themselves
- * and give up their time slice.
- */
- DosEnterCritSec();
-
- if (!done) {
- func();
- done = 1;
- }
-
- /* Restores normal thread dispatching for the current process. */
- DosExitCritSec();
-}
-
#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
#include <pthread.h>
static void once(void (*func)(void)) {
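With the OS/2 branch removed, the remaining non-Windows path (only partially visible in this hunk) can rest on the standard primitive. A minimal sketch, assuming pthreads is available:

#include <pthread.h>
static void once_sketch(void (*func)(void)) {
  static pthread_once_t lock = PTHREAD_ONCE_INIT;
  pthread_once(&lock, func);
}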
diff --git a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c
index dc4d9593a8..706b0770c8 100644
--- a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c
+++ b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c
@@ -12,4 +12,4 @@
#include "./vpx_scale_rtcd.h"
#include "vpx_ports/vpx_once.h"
-void vpx_scale_rtcd() { once(setup_rtcd_internal); }
+void vpx_scale_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vpx_util/vpx_pthread.h b/media/libvpx/libvpx/vpx_util/vpx_pthread.h
new file mode 100644
index 0000000000..cdd18d0f30
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_pthread.h
@@ -0,0 +1,157 @@
+// Copyright 2024 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// pthread.h wrapper
+
+#ifndef VPX_VPX_UTIL_VPX_PTHREAD_H_
+#define VPX_VPX_UTIL_VPX_PTHREAD_H_
+
+#include "./vpx_config.h"
+
+#if CONFIG_MULTITHREAD
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+// Prevent leaking max/min macros.
+#undef NOMINMAX
+#define NOMINMAX
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#include <errno.h> // NOLINT
+#include <process.h> // NOLINT
+#include <stddef.h> // NOLINT
+#include <windows.h> // NOLINT
+typedef HANDLE pthread_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
+
+#if _WIN32_WINNT < 0x0600
+#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer.
+#endif
+typedef CONDITION_VARIABLE pthread_cond_t;
+
+#ifndef WINAPI_FAMILY_PARTITION
+#define WINAPI_PARTITION_DESKTOP 1
+#define WINAPI_FAMILY_PARTITION(x) x
+#endif
+
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define USE_CREATE_THREAD
+#endif
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+// _beginthreadex requires __stdcall
+#if defined(__GNUC__) && \
+ (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
+#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall
+#else
+#define THREADFN unsigned int __stdcall
+#endif
+#define THREAD_EXIT_SUCCESS 0
+
+static INLINE int pthread_create(pthread_t *const thread, const void *attr,
+ unsigned int(__stdcall *start)(void *),
+ void *arg) {
+ (void)attr;
+#ifdef USE_CREATE_THREAD
+ *thread = CreateThread(NULL, /* lpThreadAttributes */
+ 0, /* dwStackSize */
+ start, arg, 0, /* dwCreationFlags */
+ NULL); /* lpThreadId */
+#else
+ *thread = (pthread_t)_beginthreadex(NULL, /* void *security */
+ 0, /* unsigned stack_size */
+ start, arg, 0, /* unsigned initflag */
+ NULL); /* unsigned *thrdaddr */
+#endif
+ if (*thread == NULL) return 1;
+ SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
+ return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
+ (void)value_ptr;
+ return (WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/) !=
+ WAIT_OBJECT_0 ||
+ CloseHandle(thread) == 0);
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+ void *mutexattr) {
+ (void)mutexattr;
+ InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
+ return 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+ return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+ EnterCriticalSection(mutex);
+ return 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+ LeaveCriticalSection(mutex);
+ return 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+ DeleteCriticalSection(mutex);
+ return 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+ (void)condition;
+ return 0;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+ void *cond_attr) {
+ (void)cond_attr;
+ InitializeConditionVariable(condition);
+ return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+ WakeConditionVariable(condition);
+ return 0;
+}
+
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+ WakeAllConditionVariable(condition);
+ return 0;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+ pthread_mutex_t *const mutex) {
+ int ok;
+ ok = SleepConditionVariableCS(condition, mutex, INFINITE);
+ return !ok;
+}
+#else // _WIN32
+#include <pthread.h> // NOLINT
+#define THREADFN void *
+#define THREAD_EXIT_SUCCESS NULL
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // CONFIG_MULTITHREAD
+
+#endif // VPX_VPX_UTIL_VPX_PTHREAD_H_
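The new header gives every CONFIG_MULTITHREAD build one pthread-shaped API: on POSIX it is literally <pthread.h>, while on Windows the same names map onto CRITICAL_SECTION, CONDITION_VARIABLE and _beginthreadex. A consumer writes against THREADFN / THREAD_EXIT_SUCCESS and stays portable; a minimal sketch (names are illustrative, not from the tree):

    #include "vpx_util/vpx_pthread.h"

    static pthread_mutex_t lock;
    static pthread_cond_t ready;
    static int have_work = 0;

    /* THREADFN expands to `unsigned int __stdcall` on Windows and to
     * `void *` on POSIX, so one definition serves both. */
    static THREADFN worker_main(void *arg) {
      (void)arg;
      pthread_mutex_lock(&lock);
      while (!have_work)  /* loop guards against spurious wakeups */
        pthread_cond_wait(&ready, &lock);
      pthread_mutex_unlock(&lock);
      /* ... do the actual work here ... */
      return THREAD_EXIT_SUCCESS;  /* 0 on Windows, NULL on POSIX */
    }

    static int run(void) {
      pthread_t t;
      pthread_mutex_init(&lock, NULL);
      pthread_cond_init(&ready, NULL);
      if (pthread_create(&t, NULL, worker_main, NULL)) return 1;
      pthread_mutex_lock(&lock);
      have_work = 1;
      pthread_cond_signal(&ready);
      pthread_mutex_unlock(&lock);
      return pthread_join(t, NULL);  /* 0 on success in both variants */
    }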
diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.c b/media/libvpx/libvpx/vpx_util/vpx_thread.c
index 04c5fb6f26..0d0e2f5766 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_thread.c
+++ b/media/libvpx/libvpx/vpx_util/vpx_thread.c
@@ -12,10 +12,18 @@
// Original source:
// https://chromium.googlesource.com/webm/libwebp
+// Enable GNU extensions in glibc so that we can call pthread_setname_np().
+// This must be before any #include statements.
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
#include <assert.h>
#include <string.h> // for memset()
+#include "./vpx_config.h"
#include "./vpx_thread.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_pthread.h"
#if CONFIG_MULTITHREAD
@@ -31,23 +39,54 @@ static void execute(VPxWorker *const worker); // Forward declaration.
static THREADFN thread_loop(void *ptr) {
VPxWorker *const worker = (VPxWorker *)ptr;
- int done = 0;
- while (!done) {
- pthread_mutex_lock(&worker->impl_->mutex_);
- while (worker->status_ == OK) { // wait in idling mode
+#ifdef __APPLE__
+ if (worker->thread_name != NULL) {
+ // Apple's version of pthread_setname_np takes one argument and operates on
+ // the current thread only. The maximum size of the thread_name buffer was
+ // noted in the Chromium source code and was confirmed by experiments. If
+ // thread_name is too long, pthread_setname_np returns -1 with errno
+ // ENAMETOOLONG (63).
+ char thread_name[64];
+ strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
+ thread_name[sizeof(thread_name) - 1] = '\0';
+ pthread_setname_np(thread_name);
+ }
+#elif (defined(__GLIBC__) && !defined(__GNU__)) || defined(__BIONIC__)
+ if (worker->thread_name != NULL) {
+ // Linux and Android require that thread names (including the nul) fit in
+ // 16 chars, otherwise pthread_setname_np() returns ERANGE (34).
+ char thread_name[16];
+ strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
+ thread_name[sizeof(thread_name) - 1] = '\0';
+ pthread_setname_np(pthread_self(), thread_name);
+ }
+#endif
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ for (;;) {
+ while (worker->status_ == VPX_WORKER_STATUS_OK) { // wait in idling mode
pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
}
- if (worker->status_ == WORK) {
+ if (worker->status_ == VPX_WORKER_STATUS_WORKING) {
+ // When worker->status_ is VPX_WORKER_STATUS_WORKING, the main thread
+ // doesn't change worker->status_ and will wait until the worker changes
+ // worker->status_ to VPX_WORKER_STATUS_OK. See change_state(). So the
+ // worker can safely call execute() without holding worker->impl_->mutex_.
+ // When the worker reacquires worker->impl_->mutex_, worker->status_ must
+ // still be VPX_WORKER_STATUS_WORKING.
+ pthread_mutex_unlock(&worker->impl_->mutex_);
execute(worker);
- worker->status_ = OK;
- } else if (worker->status_ == NOT_OK) { // finish the worker
- done = 1;
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ assert(worker->status_ == VPX_WORKER_STATUS_WORKING);
+ worker->status_ = VPX_WORKER_STATUS_OK;
+ // signal to the main thread that we're done (for sync())
+ pthread_cond_signal(&worker->impl_->condition_);
+ } else {
+ assert(worker->status_ == VPX_WORKER_STATUS_NOT_OK); // finish the worker
+ break;
}
- // signal to the main thread that we're done (for sync())
- pthread_cond_signal(&worker->impl_->condition_);
- pthread_mutex_unlock(&worker->impl_->mutex_);
}
- return THREAD_RETURN(NULL); // Thread is finished
+ pthread_mutex_unlock(&worker->impl_->mutex_);
+ return THREAD_EXIT_SUCCESS; // Thread is finished
}
// main thread state control
@@ -58,13 +97,13 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) {
if (worker->impl_ == NULL) return;
pthread_mutex_lock(&worker->impl_->mutex_);
- if (worker->status_ >= OK) {
+ if (worker->status_ >= VPX_WORKER_STATUS_OK) {
// wait for the worker to finish
- while (worker->status_ != OK) {
+ while (worker->status_ != VPX_WORKER_STATUS_OK) {
pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
}
// assign new status and release the working thread if needed
- if (new_status != OK) {
+ if (new_status != VPX_WORKER_STATUS_OK) {
worker->status_ = new_status;
pthread_cond_signal(&worker->impl_->condition_);
}
@@ -78,21 +117,21 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) {
static void init(VPxWorker *const worker) {
memset(worker, 0, sizeof(*worker));
- worker->status_ = NOT_OK;
+ worker->status_ = VPX_WORKER_STATUS_NOT_OK;
}
static int sync(VPxWorker *const worker) {
#if CONFIG_MULTITHREAD
- change_state(worker, OK);
+ change_state(worker, VPX_WORKER_STATUS_OK);
#endif
- assert(worker->status_ <= OK);
+ assert(worker->status_ <= VPX_WORKER_STATUS_OK);
return !worker->had_error;
}
static int reset(VPxWorker *const worker) {
int ok = 1;
worker->had_error = 0;
- if (worker->status_ < OK) {
+ if (worker->status_ < VPX_WORKER_STATUS_OK) {
#if CONFIG_MULTITHREAD
worker->impl_ = (VPxWorkerImpl *)vpx_calloc(1, sizeof(*worker->impl_));
if (worker->impl_ == NULL) {
@@ -107,7 +146,7 @@ static int reset(VPxWorker *const worker) {
}
pthread_mutex_lock(&worker->impl_->mutex_);
ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker);
- if (ok) worker->status_ = OK;
+ if (ok) worker->status_ = VPX_WORKER_STATUS_OK;
pthread_mutex_unlock(&worker->impl_->mutex_);
if (!ok) {
pthread_mutex_destroy(&worker->impl_->mutex_);
@@ -118,12 +157,12 @@ static int reset(VPxWorker *const worker) {
return 0;
}
#else
- worker->status_ = OK;
+ worker->status_ = VPX_WORKER_STATUS_OK;
#endif
- } else if (worker->status_ > OK) {
+ } else if (worker->status_ > VPX_WORKER_STATUS_OK) {
ok = sync(worker);
}
- assert(!ok || (worker->status_ == OK));
+ assert(!ok || (worker->status_ == VPX_WORKER_STATUS_OK));
return ok;
}
@@ -135,7 +174,7 @@ static void execute(VPxWorker *const worker) {
static void launch(VPxWorker *const worker) {
#if CONFIG_MULTITHREAD
- change_state(worker, WORK);
+ change_state(worker, VPX_WORKER_STATUS_WORKING);
#else
execute(worker);
#endif
@@ -144,7 +183,7 @@ static void launch(VPxWorker *const worker) {
static void end(VPxWorker *const worker) {
#if CONFIG_MULTITHREAD
if (worker->impl_ != NULL) {
- change_state(worker, NOT_OK);
+ change_state(worker, VPX_WORKER_STATUS_NOT_OK);
pthread_join(worker->impl_->thread_, NULL);
pthread_mutex_destroy(&worker->impl_->mutex_);
pthread_cond_destroy(&worker->impl_->condition_);
@@ -152,10 +191,10 @@ static void end(VPxWorker *const worker) {
worker->impl_ = NULL;
}
#else
- worker->status_ = NOT_OK;
+ worker->status_ = VPX_WORKER_STATUS_NOT_OK;
assert(worker->impl_ == NULL);
#endif
- assert(worker->status_ == NOT_OK);
+ assert(worker->status_ == VPX_WORKER_STATUS_NOT_OK);
}
//------------------------------------------------------------------------------
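The restructured thread_loop() also changes the locking discipline: the hook now runs with the worker mutex released. That is safe because of the protocol spelled out in the new comment above, which can be summarized as:

    /* Worker status transitions (one mutex + condvar per worker):
     *
     *   main thread:  reset()  NOT_OK -> OK       thread created
     *                 launch() OK -> WORKING      wakes the worker
     *                 sync()   waits for WORKING -> OK
     *                 end()    OK -> NOT_OK       wakes worker, joins it
     *
     *   worker: while status_ is WORKING the main thread never writes
     *   status_, so the worker may drop the mutex, run execute(), then
     *   retake the mutex, set status_ back to OK and signal sync().
     */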
diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.h b/media/libvpx/libvpx/vpx_util/vpx_thread.h
index 6d308e949b..11a1d74387 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_thread.h
+++ b/media/libvpx/libvpx/vpx_util/vpx_thread.h
@@ -15,370 +15,22 @@
#ifndef VPX_VPX_UTIL_VPX_THREAD_H_
#define VPX_VPX_UTIL_VPX_THREAD_H_
-#include "./vpx_config.h"
-
#ifdef __cplusplus
extern "C" {
#endif
-// Set maximum decode threads to be 8 due to the limit of frame buffers
-// and not enough semaphores in the emulation layer on windows.
-#define MAX_DECODE_THREADS 8
-
-#if CONFIG_MULTITHREAD
-
-#if defined(_WIN32) && !HAVE_PTHREAD_H
-#include <errno.h> // NOLINT
-#include <process.h> // NOLINT
-#include <windows.h> // NOLINT
-typedef HANDLE pthread_t;
-typedef CRITICAL_SECTION pthread_mutex_t;
-
-#if _WIN32_WINNT >= 0x0600 // Windows Vista / Server 2008 or greater
-#define USE_WINDOWS_CONDITION_VARIABLE
-typedef CONDITION_VARIABLE pthread_cond_t;
-#else
-typedef struct {
- HANDLE waiting_sem_;
- HANDLE received_sem_;
- HANDLE signal_event_;
-} pthread_cond_t;
-#endif // _WIN32_WINNT >= 0x600
-
-#ifndef WINAPI_FAMILY_PARTITION
-#define WINAPI_PARTITION_DESKTOP 1
-#define WINAPI_FAMILY_PARTITION(x) x
-#endif
-
-#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-#define USE_CREATE_THREAD
-#endif
-
-//------------------------------------------------------------------------------
-// simplistic pthread emulation layer
-
-// _beginthreadex requires __stdcall
-#if defined(__GNUC__) && \
- (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
-#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall
-#else
-#define THREADFN unsigned int __stdcall
-#endif
-#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
-
-#if _WIN32_WINNT >= 0x0501 // Windows XP or greater
-#define WaitForSingleObject(obj, timeout) \
- WaitForSingleObjectEx(obj, timeout, FALSE /*bAlertable*/)
-#endif
-
-static INLINE int pthread_create(pthread_t *const thread, const void *attr,
- unsigned int(__stdcall *start)(void *),
- void *arg) {
- (void)attr;
-#ifdef USE_CREATE_THREAD
- *thread = CreateThread(NULL, /* lpThreadAttributes */
- 0, /* dwStackSize */
- start, arg, 0, /* dwStackSize */
- NULL); /* lpThreadId */
-#else
- *thread = (pthread_t)_beginthreadex(NULL, /* void *security */
- 0, /* unsigned stack_size */
- start, arg, 0, /* unsigned initflag */
- NULL); /* unsigned *thrdaddr */
-#endif
- if (*thread == NULL) return 1;
- SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
- return 0;
-}
-
-static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
- (void)value_ptr;
- return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
- CloseHandle(thread) == 0);
-}
-
-// Mutex
-static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
- void *mutexattr) {
- (void)mutexattr;
-#if _WIN32_WINNT >= 0x0600 // Windows Vista / Server 2008 or greater
- InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
-#else
- InitializeCriticalSection(mutex);
-#endif
- return 0;
-}
-
-static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
- return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
-}
-
-static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
- EnterCriticalSection(mutex);
- return 0;
-}
-
-static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
- LeaveCriticalSection(mutex);
- return 0;
-}
-
-static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
- DeleteCriticalSection(mutex);
- return 0;
-}
-
-// Condition
-static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
- int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
- (void)condition;
-#else
- ok &= (CloseHandle(condition->waiting_sem_) != 0);
- ok &= (CloseHandle(condition->received_sem_) != 0);
- ok &= (CloseHandle(condition->signal_event_) != 0);
-#endif
- return !ok;
-}
-
-static INLINE int pthread_cond_init(pthread_cond_t *const condition,
- void *cond_attr) {
- (void)cond_attr;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
- InitializeConditionVariable(condition);
-#else
- condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
- condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
- condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
- if (condition->waiting_sem_ == NULL || condition->received_sem_ == NULL ||
- condition->signal_event_ == NULL) {
- pthread_cond_destroy(condition);
- return 1;
- }
-#endif
- return 0;
-}
-
-static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
- int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
- WakeAllConditionVariable(condition);
-#else
- while (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
- // a thread is waiting in pthread_cond_wait: allow it to be notified
- ok &= SetEvent(condition->signal_event_);
- // wait until the event is consumed so the signaler cannot consume
- // the event via its own pthread_cond_wait.
- ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
- WAIT_OBJECT_0);
- }
-#endif
- return !ok;
-}
-
-static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
- int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
- WakeConditionVariable(condition);
-#else
- if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
- // a thread is waiting in pthread_cond_wait: allow it to be notified
- ok = SetEvent(condition->signal_event_);
- // wait until the event is consumed so the signaler cannot consume
- // the event via its own pthread_cond_wait.
- ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
- WAIT_OBJECT_0);
- }
-#endif
- return !ok;
-}
-
-static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
- pthread_mutex_t *const mutex) {
- int ok;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
- ok = SleepConditionVariableCS(condition, mutex, INFINITE);
-#else
- // note that there is a consumer available so the signal isn't dropped in
- // pthread_cond_signal
- if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) return 1;
- // now unlock the mutex so pthread_cond_signal may be issued
- pthread_mutex_unlock(mutex);
- ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
- WAIT_OBJECT_0);
- ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
- pthread_mutex_lock(mutex);
-#endif
- return !ok;
-}
-
-#elif defined(__OS2__)
-#define INCL_DOS
-#include <os2.h> // NOLINT
-
-#include <errno.h> // NOLINT
-#include <stdlib.h> // NOLINT
-#include <sys/builtin.h> // NOLINT
-
-#if defined(__STRICT_ANSI__)
-// _beginthread() is not declared on __STRICT_ANSI__ mode. Declare here.
-int _beginthread(void (*)(void *), void *, unsigned, void *);
-#endif
-
-#define pthread_t TID
-#define pthread_mutex_t HMTX
-
-typedef struct {
- HEV event_sem_;
- HEV ack_sem_;
- volatile unsigned wait_count_;
-} pthread_cond_t;
-
-//------------------------------------------------------------------------------
-// simplistic pthread emulation layer
-
-#define THREADFN void *
-#define THREAD_RETURN(val) (val)
-
-typedef struct {
- void *(*start_)(void *);
- void *arg_;
-} thread_arg;
-
-static void thread_start(void *arg) {
- thread_arg targ = *(thread_arg *)arg;
- free(arg);
-
- targ.start_(targ.arg_);
-}
-
-static INLINE int pthread_create(pthread_t *const thread, const void *attr,
- void *(*start)(void *), void *arg) {
- int tid;
- thread_arg *targ = (thread_arg *)malloc(sizeof(*targ));
- if (targ == NULL) return 1;
-
- (void)attr;
-
- targ->start_ = start;
- targ->arg_ = arg;
- tid = (pthread_t)_beginthread(thread_start, NULL, 1024 * 1024, targ);
- if (tid == -1) {
- free(targ);
- return 1;
- }
-
- *thread = tid;
- return 0;
-}
-
-static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
- (void)value_ptr;
- return DosWaitThread(&thread, DCWW_WAIT) != 0;
-}
-
-// Mutex
-static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
- void *mutexattr) {
- (void)mutexattr;
- return DosCreateMutexSem(NULL, mutex, 0, FALSE) != 0;
-}
-
-static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
- return DosRequestMutexSem(*mutex, SEM_IMMEDIATE_RETURN) == 0 ? 0 : EBUSY;
-}
-
-static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
- return DosRequestMutexSem(*mutex, SEM_INDEFINITE_WAIT) != 0;
-}
-
-static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
- return DosReleaseMutexSem(*mutex) != 0;
-}
-
-static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
- return DosCloseMutexSem(*mutex) != 0;
-}
-
-// Condition
-static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
- int ok = 1;
- ok &= DosCloseEventSem(condition->event_sem_) == 0;
- ok &= DosCloseEventSem(condition->ack_sem_) == 0;
- return !ok;
-}
-
-static INLINE int pthread_cond_init(pthread_cond_t *const condition,
- void *cond_attr) {
- int ok = 1;
- (void)cond_attr;
-
- ok &=
- DosCreateEventSem(NULL, &condition->event_sem_, DCE_POSTONE, FALSE) == 0;
- ok &= DosCreateEventSem(NULL, &condition->ack_sem_, DCE_POSTONE, FALSE) == 0;
- if (!ok) {
- pthread_cond_destroy(condition);
- return 1;
- }
- condition->wait_count_ = 0;
- return 0;
-}
-
-static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
- int ok = 1;
-
- if (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) {
- ok &= DosPostEventSem(condition->event_sem_) == 0;
- ok &= DosWaitEventSem(condition->ack_sem_, SEM_INDEFINITE_WAIT) == 0;
- }
-
- return !ok;
-}
-
-static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
- int ok = 1;
-
- while (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0))
- ok &= pthread_cond_signal(condition) == 0;
-
- return !ok;
-}
-
-static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
- pthread_mutex_t *const mutex) {
- int ok = 1;
-
- __atomic_increment(&condition->wait_count_);
-
- ok &= pthread_mutex_unlock(mutex) == 0;
-
- ok &= DosWaitEventSem(condition->event_sem_, SEM_INDEFINITE_WAIT) == 0;
-
- __atomic_decrement(&condition->wait_count_);
-
- ok &= DosPostEventSem(condition->ack_sem_) == 0;
-
- pthread_mutex_lock(mutex);
-
- return !ok;
-}
-#else // _WIN32
-#include <pthread.h> // NOLINT
-#define THREADFN void *
-#define THREAD_RETURN(val) val
-#endif
-
-#endif // CONFIG_MULTITHREAD
+#define MAX_NUM_THREADS 64
// State of the worker thread object
typedef enum {
- NOT_OK = 0, // object is unusable
- OK, // ready to work
- WORK // busy finishing the current task
+ VPX_WORKER_STATUS_NOT_OK = 0, // object is unusable
+ VPX_WORKER_STATUS_OK, // ready to work
+ VPX_WORKER_STATUS_WORKING // busy finishing the current task
} VPxWorkerStatus;
// Function to be called by the worker thread. Takes two opaque pointers as
-// arguments (data1 and data2), and should return false in case of error.
+// arguments (data1 and data2). Should return true on success and return false
+// in case of error.
typedef int (*VPxWorkerHook)(void *, void *);
// Platform-dependent implementation details for the worker.
@@ -388,10 +40,14 @@ typedef struct VPxWorkerImpl VPxWorkerImpl;
typedef struct {
VPxWorkerImpl *impl_;
VPxWorkerStatus status_;
+ // Thread name for the debugger. If not NULL, must point to a string that
+ // outlives the worker thread. For portability, use a name <= 15 characters
+ // long (not including the terminating NUL character).
+ const char *thread_name;
VPxWorkerHook hook; // hook to call
void *data1; // first argument passed to 'hook'
void *data2; // second argument passed to 'hook'
- int had_error; // return value of the last call to 'hook'
+ int had_error; // true if a call to 'hook' returned false
} VPxWorker;
// The interface for all thread-worker related functions. All these functions
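Put together, the renamed states and the new thread_name field are used through the worker interface this header declares (vpx_get_worker_interface() in the unmodified remainder of the file). A usage sketch, assuming the standard init/reset/launch/sync/end entry points:

    #include "vpx_util/vpx_thread.h"

    /* Hook: returns nonzero on success, 0 on error (recorded in had_error). */
    static int sum_hook(void *data1, void *data2) {
      const int *in = (const int *)data1;
      int *out = (int *)data2;
      *out = in[0] + in[1];
      return 1;
    }

    static int example(void) {
      const VPxWorkerInterface *itf = vpx_get_worker_interface();
      VPxWorker worker;
      int in[2] = { 2, 3 }, out = 0;
      itf->init(&worker);              /* status_ = VPX_WORKER_STATUS_NOT_OK */
      worker.thread_name = "vpx-sum";  /* <= 15 chars; set before reset() */
      if (!itf->reset(&worker)) return 0;  /* spawns thread; status_ = OK */
      worker.hook = sum_hook;
      worker.data1 = in;
      worker.data2 = &out;
      itf->launch(&worker);            /* OK -> WORKING */
      if (!itf->sync(&worker)) return 0;   /* waits for WORKING -> OK */
      itf->end(&worker);               /* OK -> NOT_OK; joins the thread */
      return out == 5;
    }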
diff --git a/media/libvpx/libvpx/vpx_util/vpx_util.mk b/media/libvpx/libvpx/vpx_util/vpx_util.mk
index 1162714956..948e6d6f89 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_util.mk
+++ b/media/libvpx/libvpx/vpx_util/vpx_util.mk
@@ -10,6 +10,7 @@
UTIL_SRCS-yes += vpx_atomics.h
UTIL_SRCS-yes += vpx_util.mk
+UTIL_SRCS-yes += vpx_pthread.h
UTIL_SRCS-yes += vpx_thread.c
UTIL_SRCS-yes += vpx_thread.h
UTIL_SRCS-yes += endian_inl.h
diff --git a/media/libvpx/missing_header.patch b/media/libvpx/missing_header.patch
new file mode 100644
index 0000000000..02b77170ee
--- /dev/null
+++ b/media/libvpx/missing_header.patch
@@ -0,0 +1,12 @@
+Add missing header for EBUSY
+
+--- a/vpx_util/vpx_pthread.h
++++ b/vpx_util/vpx_pthread.h
+@@ -26,6 +26,7 @@ extern "C" {
+ #define NOMINMAX
+ #undef WIN32_LEAN_AND_MEAN
+ #define WIN32_LEAN_AND_MEAN
++#include <errno.h> // NOLINT
+ #include <process.h> // NOLINT
+ #include <stddef.h> // NOLINT
+ #include <windows.h> // NOLINT
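This vendoring patch exists because upstream's Windows branch of vpx_pthread.h uses EBUSY (in its pthread_mutex_trylock shim) without pulling in <errno.h> itself; the in-tree copy shown earlier already carries the fix. A short caller that trips the upstream omission (hypothetical):

    #include "vpx_util/vpx_pthread.h"

    /* Without <errno.h> in the header, the EBUSY inside the inline
     * pthread_mutex_trylock() is an undeclared identifier on Windows. */
    static int is_held(pthread_mutex_t *m) {
      if (pthread_mutex_trylock(m) == EBUSY) return 1;
      pthread_mutex_unlock(m);  /* trylock succeeded; release it again */
      return 0;
    }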
diff --git a/media/libvpx/moz.build b/media/libvpx/moz.build
index 582bc6fd5d..635b5d0fdd 100644
--- a/media/libvpx/moz.build
+++ b/media/libvpx/moz.build
@@ -72,7 +72,10 @@ elif CONFIG['TARGET_CPU'] == 'arm':
]
elif CONFIG['TARGET_CPU'] == 'aarch64' and CONFIG['OS_TARGET'] == 'WINNT':
EXPORTS.vpx += files['ARM64_EXPORTS']
- SOURCES += files['ARM64_SOURCES']
+ # Bug 1885585: clang on win/aarch64 cannot compile SVInt8_t type for now.
+ SOURCES += [
+ f for f in files['ARM64_SOURCES'] if not f.endswith('_sve.c')
+ ]
ASFLAGS += [ '-I%s/media/libvpx/config/win/aarch64/' % TOPSRCDIR ]
LOCAL_INCLUDES += [ '/media/libvpx/config/win/aarch64/' ]
SOURCES += [ '/media/libvpx/config/win/aarch64/vpx_config.c' ]
@@ -125,6 +128,10 @@ for f in SOURCES:
SOURCES[f].flags += ['-march=armv8.2-a+dotprod']
if 'neon_i8mm.c' in f:
SOURCES[f].flags += ['-march=armv8.2-a+dotprod+i8mm']
+ if 'sve.c' in f:
+ SOURCES[f].flags += ['-march=armv8.2-a+dotprod+i8mm+sve']
+ if 'sve2.c' in f:
+ SOURCES[f].flags += ['-march=armv9-a+sve2']
# Suppress warnings in third-party code.
CFLAGS += [
diff --git a/media/libvpx/moz.yaml b/media/libvpx/moz.yaml
index 17704a1905..0b3ec52482 100644
--- a/media/libvpx/moz.yaml
+++ b/media/libvpx/moz.yaml
@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
- release: f6b7166a2b6bac544c2c487d3a7e49bc265cdf9d (Tue Jan 02 20:08:06 2024).
+ release: 7fb8ceccf92c35cd5131b05c0502916715ebc76b (Fri Mar 15 01:11:50 2024).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
- revision: f6b7166a2b6bac544c2c487d3a7e49bc265cdf9d
+ revision: 7fb8ceccf92c35cd5131b05c0502916715ebc76b
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/
@@ -53,8 +53,10 @@ vendoring:
- tools/
patches:
+ - arm_cpu_runtime_detection_code_on_openbsd.patch
- input_frame_validation.patch
- input_frame_validation_vp9.patch
+ - missing_header.patch
update-actions:
- action: move-file
diff --git a/media/libvpx/sources.mozbuild b/media/libvpx/sources.mozbuild
index 2960dee255..1ad5d4447c 100644
--- a/media/libvpx/sources.mozbuild
+++ b/media/libvpx/sources.mozbuild
@@ -934,6 +934,7 @@ files = {
'libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c',
'libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c',
'libvpx/vp9/encoder/arm/neon/vp9_error_neon.c',
+ 'libvpx/vp9/encoder/arm/neon/vp9_error_sve.c',
'libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c',
'libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c',
'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c',
@@ -1006,6 +1007,7 @@ files = {
'libvpx/vpx_dsp/arm/subpel_variance_neon.c',
'libvpx/vpx_dsp/arm/subtract_neon.c',
'libvpx/vpx_dsp/arm/sum_squares_neon.c',
+ 'libvpx/vpx_dsp/arm/sum_squares_sve.c',
'libvpx/vpx_dsp/arm/variance_neon.c',
'libvpx/vpx_dsp/arm/variance_neon_dotprod.c',
'libvpx/vpx_dsp/arm/vpx_convolve8_neon.c',
@@ -1014,8 +1016,6 @@ files = {
'libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c',
'libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c',
'libvpx/vpx_dsp/arm/vpx_convolve_neon.c',
- 'libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c',
- 'libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c',
'libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c',
'libvpx/vpx_dsp/avg.c',
'libvpx/vpx_dsp/bitreader.c',