author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-15 03:35:49 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-15 03:35:49 +0000
commit     d8bbc7858622b6d9c278469aab701ca0b609cddf (patch)
tree       eff41dc61d9f714852212739e6b3738b82a2af87 /media/libvpx/libvpx
parent     Releasing progress-linux version 125.0.3-1~progress7.99u1. (diff)
download   firefox-d8bbc7858622b6d9c278469aab701ca0b609cddf.tar.xz
           firefox-d8bbc7858622b6d9c278469aab701ca0b609cddf.zip
Merging upstream version 126.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'media/libvpx/libvpx')
-rw-r--r--  media/libvpx/libvpx/.mailmap | 3
-rw-r--r--  media/libvpx/libvpx/AUTHORS | 11
-rw-r--r--  media/libvpx/libvpx/CHANGELOG | 76
-rw-r--r--  media/libvpx/libvpx/README | 40
-rw-r--r--  media/libvpx/libvpx/build/make/Android.mk | 13
-rw-r--r--  media/libvpx/libvpx/build/make/Makefile | 2
-rw-r--r--  media/libvpx/libvpx/build/make/configure.sh | 109
-rwxr-xr-x  media/libvpx/libvpx/build/make/rtcd.pl | 2
-rwxr-xr-x  media/libvpx/libvpx/configure | 7
-rw-r--r--  media/libvpx/libvpx/examples/resize_util.c | 2
-rw-r--r--  media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c | 9
-rw-r--r--  media/libvpx/libvpx/examples/vp9cx_set_ref.c | 2
-rw-r--r--  media/libvpx/libvpx/libs.doxy_template | 8
-rw-r--r--  media/libvpx/libvpx/libs.mk | 4
-rw-r--r--  media/libvpx/libvpx/test/android/get_files.py | 17
-rw-r--r--  media/libvpx/libvpx/test/avg_test.cc | 9
-rw-r--r--  media/libvpx/libvpx/test/codec_factory.h | 8
-rw-r--r--  media/libvpx/libvpx/test/convolve_test.cc | 86
-rw-r--r--  media/libvpx/libvpx/test/encode_api_test.cc | 418
-rw-r--r--  media/libvpx/libvpx/test/frame_size_tests.cc | 2
-rw-r--r--  media/libvpx/libvpx/test/init_vpx_test.cc | 3
-rw-r--r--  media/libvpx/libvpx/test/resize_test.cc | 10
-rw-r--r--  media/libvpx/libvpx/test/sum_squares_test.cc | 7
-rw-r--r--  media/libvpx/libvpx/test/variance_test.cc | 261
-rw-r--r--  media/libvpx/libvpx/test/video_source.h | 2
-rw-r--r--  media/libvpx/libvpx/test/vp8_datarate_test.cc | 25
-rw-r--r--  media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc | 7
-rw-r--r--  media/libvpx/libvpx/test/vp9_block_error_test.cc | 9
-rw-r--r--  media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc | 987
-rw-r--r--  media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc | 3
-rw-r--r--  media/libvpx/libvpx/test/vp9_scale_test.cc | 9
-rw-r--r--  media/libvpx/libvpx/tools_common.c | 36
-rw-r--r--  media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c | 2
-rw-r--r--  media/libvpx/libvpx/vp8/common/entropy.c | 2
-rw-r--r--  media/libvpx/libvpx/vp8/common/generic/systemdependent.c | 41
-rw-r--r--  media/libvpx/libvpx/vp8/common/onyx.h | 2
-rw-r--r--  media/libvpx/libvpx/vp8/common/rtcd.c | 2
-rw-r--r--  media/libvpx/libvpx/vp8/common/threading.h | 153
-rw-r--r--  media/libvpx/libvpx/vp8/decoder/onyxd_if.c | 2
-rw-r--r--  media/libvpx/libvpx/vp8/decoder/onyxd_int.h | 5
-rw-r--r--  media/libvpx/libvpx/vp8/decoder/threading.c | 33
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/encodeframe.c | 46
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/ethreading.c | 63
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/onyx_if.c | 48
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/onyx_int.h | 9
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/ratectrl.c | 29
-rw-r--r--  media/libvpx/libvpx/vp8/encoder/tokenize.h | 2
-rw-r--r--  media/libvpx/libvpx/vp8/vp8_cx_iface.c | 84
-rw-r--r--  media/libvpx/libvpx/vp8/vp8_dx_iface.c | 2
-rw-r--r--  media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc | 13
-rw-r--r--  media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h | 10
-rw-r--r--  media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h | 1
-rw-r--r--  media/libvpx/libvpx/vp9/common/vp9_rtcd.c | 2
-rw-r--r--  media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl | 6
-rw-r--r--  media/libvpx/libvpx/vp9/common/vp9_thread_common.c | 1
-rw-r--r--  media/libvpx/libvpx/vp9/common/vp9_thread_common.h | 1
-rw-r--r--  media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c | 2
-rw-r--r--  media/libvpx/libvpx/vp9/decoder/vp9_decoder.c | 2
-rw-r--r--  media/libvpx/libvpx/vp9/decoder/vp9_decoder.h | 1
-rw-r--r--  media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c | 1
-rw-r--r--  media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h | 2
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c | 78
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_block.h | 2
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c | 6
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h | 2
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c | 107
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encoder.c | 174
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encoder.h | 8
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ethread.c | 13
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ethread.h | 3
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c | 52
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h | 9
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_extend.c | 39
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_extend.h | 3
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c | 114
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c | 97
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c | 1
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_quantize.c | 1
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c | 81
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h | 6
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c | 2
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c | 74
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h | 1
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c | 16
-rw-r--r--  media/libvpx/libvpx/vp9/ratectrl_rtc.cc | 2
-rw-r--r--  media/libvpx/libvpx/vp9/ratectrl_rtc.h | 35
-rw-r--r--  media/libvpx/libvpx/vp9/simple_encode.cc | 12
-rw-r--r--  media/libvpx/libvpx/vp9/vp9_cx_iface.c | 102
-rw-r--r--  media/libvpx/libvpx/vp9/vp9_dx_iface.c | 1
-rw-r--r--  media/libvpx/libvpx/vp9/vp9cx.mk | 1
-rw-r--r--  media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h | 8
-rw-r--r--  media/libvpx/libvpx/vpx/src/vpx_encoder.c | 9
-rw-r--r--  media/libvpx/libvpx/vpx/src/vpx_image.c | 4
-rw-r--r--  media/libvpx/libvpx/vpx/src/vpx_tpl.c | 6
-rw-r--r--  media/libvpx/libvpx/vpx/vp8cx.h | 2
-rw-r--r--  media/libvpx/libvpx/vpx/vpx_encoder.h | 18
-rw-r--r--  media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h | 24
-rw-r--r--  media/libvpx/libvpx/vpx/vpx_tpl.h | 22
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c | 68
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c | 344
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c | 1905
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c | 351
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c | 452
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c | 58
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c | 2
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h | 201
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c | 73
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h | 72
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c | 897
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h | 449
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c | 1428
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c | 1250
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c | 58
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c | 66
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c | 66
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h | 32
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h | 51
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c | 445
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk | 8
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c | 2
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl | 124
-rw-r--r--  media/libvpx/libvpx/vpx_dsp/vpx_filter.h | 1
-rw-r--r--  media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c | 21
-rw-r--r--  media/libvpx/libvpx/vpx_ports/arm.h | 2
-rw-r--r--  media/libvpx/libvpx/vpx_ports/emms_mmx.c | 2
-rw-r--r--  media/libvpx/libvpx/vpx_ports/mem.h | 8
-rw-r--r--  media/libvpx/libvpx/vpx_ports/vpx_once.h | 23
-rw-r--r--  media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c | 2
-rw-r--r--  media/libvpx/libvpx/vpx_util/vpx_pthread.h | 157
-rw-r--r--  media/libvpx/libvpx/vpx_util/vpx_thread.c | 93
-rw-r--r--  media/libvpx/libvpx/vpx_util/vpx_thread.h | 366
-rw-r--r--  media/libvpx/libvpx/vpx_util/vpx_util.mk | 1
132 files changed, 6973 insertions(+), 5994 deletions(-)
diff --git a/media/libvpx/libvpx/.mailmap b/media/libvpx/libvpx/.mailmap
index bb0ddd95b2..7206b5ebec 100644
--- a/media/libvpx/libvpx/.mailmap
+++ b/media/libvpx/libvpx/.mailmap
@@ -20,6 +20,7 @@ Hui Su <huisu@google.com>
Jacky Chen <jackychen@google.com>
Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com>
+Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
Johann <johann@duck.com> <johann.koenig@gmail.com>
@@ -53,4 +54,4 @@ Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
Yaowu Xu <yaowu@google.com> <Yaowu Xu>
Venkatarama NG. Avadhani <venkatarama.avadhani@ittiam.com>
Vitaly Buka <vitalybuka@chromium.org> <vitlaybuka@chromium.org>
-xiwei gu <guxiwei-hf@loongson.cn>
+Xiwei Gu <guxiwei-hf@loongson.cn>
diff --git a/media/libvpx/libvpx/AUTHORS b/media/libvpx/libvpx/AUTHORS
index 2db4a113e4..5515e26589 100644
--- a/media/libvpx/libvpx/AUTHORS
+++ b/media/libvpx/libvpx/AUTHORS
@@ -25,6 +25,7 @@ Andrew Salkeld <andrew.salkeld@arm.com>
Angie Chen <yunqi@google.com>
Angie Chiang <angiebird@google.com>
Anton Venema <anton.venema@liveswitch.com>
+Anupam Pandey <anupam.pandey@ittiam.com>
Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
Birk Magnussen <birk.magnussen@googlemail.com>
@@ -34,6 +35,8 @@ Brion Vibber <bvibber@wikimedia.org>
changjun.yang <changjun.yang@intel.com>
Charles 'Buck' Krasic <ckrasic@google.com>
Cheng Chen <chengchen@google.com>
+Chen Wang <wangchen20@iscas.ac.cn>
+Cherma Rajan A <cherma.rajan@ittiam.com>
Chi Yo Tsai <chiyotsai@google.com>
chm <chm@rock-chips.com>
Chris Cunningham <chcunningham@chromium.org>
@@ -60,6 +63,8 @@ Fritz Koenig <frkoenig@google.com>
Fyodor Kyslov <kyslov@google.com>
Gabriel Marin <gmx@chromium.org>
Gaute Strokkenes <gaute.strokkenes@broadcom.com>
+George Steed <george.steed@arm.com>
+Gerda Zsejke More <gerdazsejke.more@arm.com>
Geza Lore <gezalore@gmail.com>
Ghislain MARY <ghislainmary2@gmail.com>
Giuseppe Scrivano <gscrivano@gnu.org>
@@ -103,6 +108,7 @@ Jin Bo <jinbo@loongson.cn>
Jingning Han <jingning@google.com>
Joel Fernandes <joelaf@google.com>
Joey Parrish <joeyparrish@google.com>
+Johann <johann@duck.com>
Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com>
Johnny Klonaris <google@jawknee.com>
@@ -120,6 +126,7 @@ KO Myung-Hun <komh@chollian.net>
Konstantinos Margaritis <konma@vectorcamp.gr>
Kyle Siefring <kylesiefring@gmail.com>
Lawrence Velázquez <larryv@macports.org>
+L. E. Segovia <amy@amyspark.me>
Linfeng Zhang <linfengz@google.com>
Liu Peng <pengliu.mail@gmail.com>
Lou Quillio <louquillio@google.com>
@@ -147,6 +154,7 @@ Mirko Bonadei <mbonadei@google.com>
Moriyoshi Koizumi <mozo@mozo.jp>
Morton Jonuschat <yabawock@gmail.com>
Nathan E. Egge <negge@mozilla.com>
+Neeraj Gadgil <neeraj.gadgil@ittiam.com>
Neil Birkbeck <neil.birkbeck@gmail.com>
Nico Weber <thakis@chromium.org>
Niveditha Rau <niveditha.rau@gmail.com>
@@ -213,7 +221,8 @@ Vitaly Buka <vitalybuka@chromium.org>
Vlad Tsyrklevich <vtsyrklevich@chromium.org>
Wan-Teh Chang <wtc@google.com>
Wonkap Jang <wonkap@google.com>
-xiwei gu <guxiwei-hf@loongson.cn>
+Xiahong Bao <xiahong.bao@nxp.com>
+Xiwei Gu <guxiwei-hf@loongson.cn>
Yaowu Xu <yaowu@google.com>
Yi Luo <luoyi@google.com>
Yongzhe Wang <yongzhe@google.com>
diff --git a/media/libvpx/libvpx/CHANGELOG b/media/libvpx/libvpx/CHANGELOG
index 21070785ed..87f0d7f708 100644
--- a/media/libvpx/libvpx/CHANGELOG
+++ b/media/libvpx/libvpx/CHANGELOG
@@ -1,7 +1,79 @@
-20yy-mm-dd v1.14.0 "V Duck"
+2024-01-02 v1.14.0 "Venetian Duck"
This release drops support for old C compilers, such as Visual Studio 2012
and older, that disallow mixing variable declarations and statements (a C99
- feature).
+ feature). It adds support for run-time CPU feature detection for Arm
+ platforms, as well as support for darwin23 (macOS 14).
+
+ - Upgrading:
+ This release is ABI incompatible with the previous release.
+
+ Various new features for rate control library for real-time: SVC parallel
+ encoding, loopfilter level, support for frame dropping, and screen content.
+
+ New callback function send_tpl_gop_stats for vp9 external rate control
+ library, which can be used to transmit TPL stats for a group of pictures. A
+ public header vpx_tpl.h is added for the definition of TPL stats used in
+ this callback.
+
+ libwebm is upgraded to libwebm-1.0.0.29-9-g1930e3c.
+
+ - Enhancement:
+ Improvements on Neon optimizations: VoD: 12-35% speed up for bitdepth 8,
+ 68%-151% speed up for high bitdepth.
+
+ Improvements on AVX2 and SSE optimizations.
+ Improvements on LSX optimizations for LoongArch.
+ 42-49% speedup on speed 0 VoD encoding.
+ Android API level predicates.
+
+ - Bug fixes:
+ Fix to missing prototypes from the rtcd header.
+ Fix to segfault when total size is enlarged but width is smaller.
+ Fix to the build for arm64ec using MSVC.
+ Fix to copy BLOCK_8X8's mi to PICK_MODE_CONTEXT::mic.
+ Fix to -Wshadow warnings.
+ Fix to heap overflow in vpx_get4x4sse_cs_neon.
+ Fix to buffer overrun in highbd Neon subpel variance filters.
+ Added bitexact encode test script.
+ Fix to -Wl,-z,defs with Clang's sanitizers.
+ Fix to decoder stability after error & continued decoding.
+ Fix to mismatch of VP9 encode with NEON intrinsics with C only version.
+ Fix to Arm64 MSVC compile vpx_highbd_fdct4x4_neon.
+ Fix to fragments count before use.
+ Fix to a case where target bandwidth is 0 for SVC.
+ Fix mask in vp9_quantize_avx2,highbd_get_max_lane_eob.
+ Fix to int overflow in vp9_calc_pframe_target_size_one_pass_cbr.
+ Fix to integer overflow in vp8,ratectrl.c.
+ Fix to integer overflow in vp9 svc.
+ Fix to avg_frame_bandwidth overflow.
+ Fix to per frame qp for temporal layers.
+ Fix to unsigned integer overflow in sse computation.
+ Fix to uninitialized mesh feature for BEST mode.
+ Fix to overflow in highbd temporal_filter.
+ Fix to unaligned loads w/w==4 in vpx_convolve_copy_neon.
+ Skip arm64_neon.h workaround w/VS >= 2019.
+ Fix to c vs avx mismatch of diamond_search_sad().
+ Fix to c vs intrinsic mismatch of vpx_hadamard_32x32() function.
+ Fix to a bug in vpx_hadamard_32x32_neon().
+ Fix to Clang -Wunreachable-code-aggressive warnings.
+ Fix to a bug in vpx_highbd_hadamard_32x32_neon().
+ Fix to -Wunreachable-code in mfqe_partition.
+ Force mode search on 64x64 if no mode is selected.
+ Fix to ubsan failure caused by left shift of negative.
+ Fix to integer overflow in calc_pframe_target_size.
+ Fix to float-cast-overflow in vp8_change_config().
+ Fix to a null ptr before use.
+ Conditionally skip using inter frames in speed features.
+ Remove invalid reference frames.
+ Disable intra mode search speed features conditionally.
+ Set nonrd keyframe under dynamic change of deadline for rtc.
+ Fix to scaled reference offsets.
+ Set skip_recode=0 in nonrd_pick_sb_modes.
+ Fix to an edge case when downsizing to one.
+ Fix to a bug in frame scaling.
+ Fix to pred buffer stride.
+ Fix to a bug in simple motion search.
+ Update frame size in actual encoding.
2023-09-29 v1.13.1 "Ugly Duckling"
This release contains two security related fixes. One each for VP8 and VP9.
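
The send_tpl_gop_stats callback introduced in the changelog above belongs to
the VP9 external rate control interface. Below is a minimal sketch of a
receiver, assuming the vpx_rc_funcs_t registration path via
VP9E_SET_EXTERNAL_RATE_CONTROL and the VpxTplGopStats layout from the new
vpx/vpx_tpl.h; member names are taken from the 1.14.0 headers and should be
treated as assumptions, not a definitive implementation:

  #include "vpx/vp8cx.h"
  #include "vpx/vpx_ext_ratectrl.h"
  #include "vpx/vpx_tpl.h"

  /* Callback invoked once per group of pictures with its TPL stats. */
  static vpx_rc_status_t my_send_tpl_gop_stats(
      vpx_rc_model_t model, const VpxTplGopStats *gop_stats) {
    (void)model;
    for (int i = 0; i < gop_stats->size; ++i) {
      /* Each entry carries per-block TPL stats for one frame. */
      const VpxTplFrameStats *fs = &gop_stats->frame_stats_list[i];
      (void)fs;
    }
    return VPX_RC_OK;
  }

  /* Register the callback alongside the other external-RC hooks. */
  static void register_ext_rc(vpx_codec_ctx_t *enc, vpx_rc_funcs_t *funcs) {
    funcs->send_tpl_gop_stats = my_send_tpl_gop_stats;
    vpx_codec_control(enc, VP9E_SET_EXTERNAL_RATE_CONTROL, funcs);
  }
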
diff --git a/media/libvpx/libvpx/README b/media/libvpx/libvpx/README
index 4c25b15d81..6dbd164c34 100644
--- a/media/libvpx/libvpx/README
+++ b/media/libvpx/libvpx/README
@@ -1,5 +1,3 @@
-v1.13.1 Ugly Duckling
-
Welcome to the WebM VP8/VP9 Codec SDK!
COMPILING THE APPLICATIONS/LIBRARIES:
@@ -183,6 +181,44 @@ CODE STYLE:
See also: http://clang.llvm.org/docs/ClangFormat.html
+PROFILE GUIDED OPTIMIZATION (PGO)
+ Profile Guided Optimization can be enabled for Clang builds using the
+ commands:
+
+ $ export CC=clang
+ $ export CXX=clang++
+ $ ../libvpx/configure --enable-profile
+ $ make
+
+ Generate one or multiple PGO profile files by running vpxdec or vpxenc. For
+ example:
+
+ $ ./vpxdec ../vpx/out_ful/vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm \
+ -o - > /dev/null
+
+ To convert and merge the raw profile files, use the llvm-profdata tool:
+
+ $ llvm-profdata merge -o perf.profdata default_8382761441159425451_0.profraw
+
+ Then, rebuild the project with the new profile file:
+
+ $ make clean
+ $ ../libvpx/configure --use-profile=perf.profdata
+ $ make
+
+ Note: Always use the llvm-profdata from the toolchain that is used for
+ compiling the PGO-enabled binary.
+
+ To observe the improvements from a PGO-enabled build, enable and compare the
+ list of failed optimizations by using the -Rpass-missed compiler flag. For
+ example, to list the failed loop vectorizations:
+
+ $ ../libvpx/configure --use-profile=perf.profdata \
+ --extra-cflags=-Rpass-missed=loop-vectorize
+
+ For guidance on utilizing PGO files to identify potential optimization
+ opportunities, see: tools/README.pgo.md
+
SUPPORT
This library is an open source project supported by its community. Please
email webm-discuss@webmproject.org for help.
diff --git a/media/libvpx/libvpx/build/make/Android.mk b/media/libvpx/libvpx/build/make/Android.mk
index ba24f541b1..533f43c1c2 100644
--- a/media/libvpx/libvpx/build/make/Android.mk
+++ b/media/libvpx/libvpx/build/make/Android.mk
@@ -15,13 +15,9 @@ ifdef NDK_ROOT
# In an Android project place a libvpx checkout in the jni directory.
# Run the configure script from the jni directory. Base libvpx
# encoder/decoder configuration will look similar to:
-# ./libvpx/configure --target=armv7-android-gcc --disable-examples \
+# ./libvpx/configure --target=arm64-android-gcc --disable-examples \
# --enable-external-build
#
-# When targeting Android, realtime-only is enabled by default. This can
-# be overridden by adding the command line flag:
-# --disable-realtime-only
-#
# This will create .mk files that contain variables that contain the
# source files to compile.
#
@@ -38,11 +34,14 @@ ifdef NDK_ROOT
# but the resulting library *must* be run on devices supporting all of the
# enabled extensions. They can be disabled individually with
# --disable-{sse2, sse3, ssse3, sse4_1, avx, avx2, avx512}
-# --disable-neon[-asm]
+# --disable-neon{, -asm, -neon-dotprod, -neon-i8mm}
+# --disable-sve
# --disable-{dspr2, msa}
#
-# Running ndk-build will build libvpx and include it in your project.
+# Running ndk-build will build libvpx and include it in your project. Set
+# APP_ABI to match the --target passed to configure:
+# https://developer.android.com/ndk/guides/application_mk#app_abi.
#
CONFIG_DIR := $(LOCAL_PATH)/
diff --git a/media/libvpx/libvpx/build/make/Makefile b/media/libvpx/libvpx/build/make/Makefile
index 199ed78058..658b37617b 100644
--- a/media/libvpx/libvpx/build/make/Makefile
+++ b/media/libvpx/libvpx/build/make/Makefile
@@ -150,6 +150,8 @@ $(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm
$(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm
$(BUILD_PFX)%_sve.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve
$(BUILD_PFX)%_sve.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve
+$(BUILD_PFX)%_sve2.c.d: CFLAGS += -march=armv9-a+sve2
+$(BUILD_PFX)%_sve2.c.o: CFLAGS += -march=armv9-a+sve2
# POWER
$(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx
diff --git a/media/libvpx/libvpx/build/make/configure.sh b/media/libvpx/libvpx/build/make/configure.sh
index 869793a296..009bf7db5c 100644
--- a/media/libvpx/libvpx/build/make/configure.sh
+++ b/media/libvpx/libvpx/build/make/configure.sh
@@ -74,6 +74,8 @@ Build options:
--cpu=CPU optimize for a specific cpu rather than a family
--extra-cflags=ECFLAGS add ECFLAGS to CFLAGS [$CFLAGS]
--extra-cxxflags=ECXXFLAGS add ECXXFLAGS to CXXFLAGS [$CXXFLAGS]
+ --use-profile=PROFILE_FILE
+ Use PROFILE_FILE for PGO
${toggle_extra_warnings} emit harmless warnings (always non-fatal)
${toggle_werror} treat warnings as errors, if possible
(not available with all compilers)
@@ -81,6 +83,7 @@ Build options:
${toggle_pic} turn on/off Position Independent Code
${toggle_ccache} turn on/off compiler cache
${toggle_debug} enable/disable debug mode
+ ${toggle_profile} enable/disable profiling
${toggle_gprof} enable/disable gprof profiling instrumentation
${toggle_gcov} enable/disable gcov coverage instrumentation
${toggle_thumb} enable/disable building arm assembly in thumb mode
@@ -429,6 +432,26 @@ check_gcc_machine_options() {
fi
}
+check_neon_sve_bridge_compiles() {
+ if enabled sve; then
+ check_cc -march=armv8.2-a+dotprod+i8mm+sve <<EOF
+#ifndef __ARM_NEON_SVE_BRIDGE
+#error 1
+#endif
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+EOF
+ compile_result=$?
+ if [ ${compile_result} -ne 0 ]; then
+ log_echo " disabling sve: arm_neon_sve_bridge.h not supported by compiler"
+ log_echo " disabling sve2: arm_neon_sve_bridge.h not supported by compiler"
+ disable_feature sve
+ disable_feature sve2
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sve --disable-sve2 "
+ fi
+ fi
+}
+
check_gcc_avx512_compiles() {
if disabled gcc; then
return
@@ -611,6 +634,9 @@ process_common_cmdline() {
--extra-cxxflags=*)
extra_cxxflags="${optval}"
;;
+ --use-profile=*)
+ pgo_file=${optval}
+ ;;
--enable-?*|--disable-?*)
eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
if is_in ${option} ${ARCH_EXT_LIST}; then
@@ -951,7 +977,7 @@ EOF
add_cflags "-mmacosx-version-min=10.15"
add_ldflags "-mmacosx-version-min=10.15"
;;
- *-darwin2[0-2]-*)
+ *-darwin2[0-3]-*)
add_cflags "-arch ${toolchain%%-*}"
add_ldflags "-arch ${toolchain%%-*}"
;;
@@ -980,36 +1006,18 @@ EOF
case ${toolchain} in
arm*)
soft_enable runtime_cpu_detect
- # Arm ISA extensions are treated as supersets.
- case ${tgt_isa} in
- arm64|armv8)
- for ext in ${ARCH_EXT_LIST_AARCH64}; do
- # Disable higher order extensions to simplify dependencies.
- if [ "$disable_exts" = "yes" ]; then
- if ! disabled $ext; then
- RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
- disable_feature $ext
- fi
- elif disabled $ext; then
- disable_exts="yes"
- else
- soft_enable $ext
- fi
- done
- ;;
- armv7|armv7s)
- soft_enable neon
- # Only enable neon_asm when neon is also enabled.
- enabled neon && soft_enable neon_asm
- # If someone tries to force it through, die.
- if disabled neon && enabled neon_asm; then
- die "Disabling neon while keeping neon-asm is not supported"
- fi
- ;;
- esac
- asm_conversion_cmd="cat"
+ if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
+ soft_enable neon
+ # Only enable neon_asm when neon is also enabled.
+ enabled neon && soft_enable neon_asm
+ # If someone tries to force it through, die.
+ if disabled neon && enabled neon_asm; then
+ die "Disabling neon while keeping neon-asm is not supported"
+ fi
+ fi
+ asm_conversion_cmd="cat"
case ${tgt_cc} in
gcc)
link_with_cc=gcc
@@ -1228,6 +1236,38 @@ EOF
fi
;;
esac
+
+ # AArch64 ISA extensions are treated as supersets.
+ if [ ${tgt_isa} = "arm64" ] || [ ${tgt_isa} = "armv8" ]; then
+ aarch64_arch_flag_neon="arch=armv8-a"
+ aarch64_arch_flag_neon_dotprod="arch=armv8.2-a+dotprod"
+ aarch64_arch_flag_neon_i8mm="arch=armv8.2-a+dotprod+i8mm"
+ aarch64_arch_flag_sve="arch=armv8.2-a+dotprod+i8mm+sve"
+ aarch64_arch_flag_sve2="arch=armv9-a+sve2"
+ for ext in ${ARCH_EXT_LIST_AARCH64}; do
+ if [ "$disable_exts" = "yes" ]; then
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+ soft_disable $ext
+ else
+ # Check the compiler supports the -march flag for the extension.
+ # This needs to happen after toolchain/OS inspection so we handle
+ # $CROSS etc correctly when checking for flags, else these will
+ # always fail.
+ flag="$(eval echo \$"aarch64_arch_flag_${ext}")"
+ check_gcc_machine_option "${flag}" "${ext}"
+ if ! enabled $ext; then
+ # Disable higher order extensions to simplify dependencies.
+ disable_exts="yes"
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+ soft_disable $ext
+ fi
+ fi
+ done
+ if enabled sve; then
+ check_neon_sve_bridge_compiles
+ fi
+ fi
+
;;
mips*)
link_with_cc=gcc
@@ -1484,6 +1524,14 @@ EOF
;;
esac
+ # Enable PGO
+ if [ -n "${pgo_file}" ]; then
+ check_add_cflags -fprofile-use=${pgo_file} || \
+ die "-fprofile-use is not supported by compiler"
+ check_add_ldflags -fprofile-use=${pgo_file} || \
+ die "-fprofile-use is not supported by linker"
+ fi
+
# Try to enable CPU specific tuning
if [ -n "${tune_cpu}" ]; then
if [ -n "${tune_cflags}" ]; then
@@ -1504,6 +1552,9 @@ EOF
else
check_add_cflags -DNDEBUG
fi
+ enabled profile &&
+ check_add_cflags -fprofile-generate &&
+ check_add_ldflags -fprofile-generate
enabled gprof && check_add_cflags -pg && check_add_ldflags -pg
enabled gcov &&
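
The check_neon_sve_bridge_compiles probe added above gates the new SVE and
SVE2 kernels on the availability of arm_neon_sve_bridge.h. Below is a minimal
sketch of the Neon/SVE mixing that header enables, assuming the ACLE bridge
intrinsics; libvpx's real helpers live in
vpx_dsp/arm/vpx_neon_sve_bridge.h, and this is an illustration rather than
that file:

  #include <arm_neon.h>
  #include <arm_sve.h>
  #include <arm_neon_sve_bridge.h>

  /* Accumulating dot product of two Neon int16x8_t vectors using the SVE
   * SDOT (16-bit to 64-bit) instruction, which has no Neon equivalent.
   * Build with -march=armv8.2-a+dotprod+i8mm+sve, matching the Makefile
   * rules added for *_sve.c files. */
  static int64x2_t dotq_s16(int64x2_t acc, int16x8_t a, int16x8_t b) {
    svint64_t sum = svdot_s64(svset_neonq_s64(svundef_s64(), acc),
                              svset_neonq_s16(svundef_s16(), a),
                              svset_neonq_s16(svundef_s16(), b));
    return svget_neonq_s64(sum);
  }

The probe keys on the __ARM_NEON_SVE_BRIDGE feature macro rather than on SVE
support alone, so toolchains whose arm_sve.h predates the bridge header
disable both sve and sve2 cleanly.
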
diff --git a/media/libvpx/libvpx/build/make/rtcd.pl b/media/libvpx/libvpx/build/make/rtcd.pl
index 0b9e16738e..025238d678 100755
--- a/media/libvpx/libvpx/build/make/rtcd.pl
+++ b/media/libvpx/libvpx/build/make/rtcd.pl
@@ -487,7 +487,7 @@ if ($opts{arch} eq 'x86') {
@ALL_ARCHS = filter(qw/neon_asm neon/);
arm;
} elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
- @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve/);
+ @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve sve2/);
@REQUIRES = filter(qw/neon/);
&require(@REQUIRES);
arm;
diff --git a/media/libvpx/libvpx/configure b/media/libvpx/libvpx/configure
index b212e0709d..97e78996e8 100755
--- a/media/libvpx/libvpx/configure
+++ b/media/libvpx/libvpx/configure
@@ -260,6 +260,7 @@ ARCH_EXT_LIST_AARCH64="
neon_dotprod
neon_i8mm
sve
+ sve2
"
ARCH_EXT_LIST_X86="
@@ -376,6 +377,7 @@ CMDLINE_SELECT="
install_libs
install_srcs
debug
+ profile
gprof
gcov
pic
@@ -659,6 +661,7 @@ process_toolchain() {
check_add_cflags -Wmissing-declarations
check_add_cflags -Wmissing-prototypes
check_add_cflags -Wshadow
+ check_add_cflags -Wstrict-prototypes
check_add_cflags -Wuninitialized
check_add_cflags -Wunreachable-code-aggressive
check_add_cflags -Wunused
@@ -677,6 +680,10 @@ process_toolchain() {
# would be needed to apply this only to test/*.cc.
check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32
+ # Do not allow implicit vector type conversions on Clang builds (this
+ # is already the default on GCC builds).
+ check_add_cflags -flax-vector-conversions=none
+
# Quiet gcc 6 vs 7 abi warnings:
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
if enabled arm; then
diff --git a/media/libvpx/libvpx/examples/resize_util.c b/media/libvpx/libvpx/examples/resize_util.c
index 5fb63e1660..083bd2519d 100644
--- a/media/libvpx/libvpx/examples/resize_util.c
+++ b/media/libvpx/libvpx/examples/resize_util.c
@@ -20,7 +20,7 @@
static const char *exec_name = NULL;
-static void usage() {
+static void usage(void) {
printf("Usage:\n");
printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ",
exec_name);
diff --git a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c
index 998e4fb20d..4050c093cd 100644
--- a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -1156,12 +1156,13 @@ int main(int argc, const char **argv) {
#if CONFIG_VP9_DECODER && !SIMULCAST_MODE
vpx_codec_control(&encoder, VP9E_GET_SVC_LAYER_ID, &layer_id);
// Don't look for mismatch on top spatial and top temporal layers as they
- // are non reference frames.
+ // are non reference frames. Don't look at frames whose top spatial layer
+ // is dropped.
if ((enc_cfg.ss_number_layers > 1 || enc_cfg.ts_number_layers > 1) &&
+ cx_pkt->data.frame
+ .spatial_layer_encoded[enc_cfg.ss_number_layers - 1] &&
!(layer_id.temporal_layer_id > 0 &&
- layer_id.temporal_layer_id == (int)enc_cfg.ts_number_layers - 1 &&
- cx_pkt->data.frame
- .spatial_layer_encoded[enc_cfg.ss_number_layers - 1])) {
+ layer_id.temporal_layer_id == (int)enc_cfg.ts_number_layers - 1)) {
test_decode(&encoder, &decoder, frame_cnt, &mismatch_seen);
}
#endif
diff --git a/media/libvpx/libvpx/examples/vp9cx_set_ref.c b/media/libvpx/libvpx/examples/vp9cx_set_ref.c
index 1a0823153b..6e12d668b0 100644
--- a/media/libvpx/libvpx/examples/vp9cx_set_ref.c
+++ b/media/libvpx/libvpx/examples/vp9cx_set_ref.c
@@ -60,7 +60,7 @@
static const char *exec_name;
-void usage_exit() {
+void usage_exit(void) {
fprintf(stderr,
"Usage: %s <width> <height> <infile> <outfile> "
"<frame> <limit(optional)>\n",
diff --git a/media/libvpx/libvpx/libs.doxy_template b/media/libvpx/libvpx/libs.doxy_template
index 1ee442af3e..6d05162d00 100644
--- a/media/libvpx/libvpx/libs.doxy_template
+++ b/media/libvpx/libvpx/libs.doxy_template
@@ -1223,14 +1223,6 @@ DOT_GRAPH_MAX_NODES = 50
MAX_DOT_GRAPH_DEPTH = 0
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, which results in a white background.
-# Warning: Depending on the platform used, enabling this option may lead to
-# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
-# read).
-
-DOT_TRANSPARENT = YES
-
# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
# files in one run (i.e. multiple -o and -T options on the command line). This
# makes dot run faster, but since only newer versions of dot (>1.8.10)
diff --git a/media/libvpx/libvpx/libs.mk b/media/libvpx/libvpx/libs.mk
index ff1c569c3b..5964386710 100644
--- a/media/libvpx/libvpx/libs.mk
+++ b/media/libvpx/libvpx/libs.mk
@@ -313,9 +313,9 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
# To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current
# SO_VERSION_* then follow the rules in the link to detemine the new version
# (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
-SO_VERSION_MAJOR := 8
+SO_VERSION_MAJOR := 9
SO_VERSION_MINOR := 0
-SO_VERSION_PATCH := 1
+SO_VERSION_PATCH := 0
ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib
SHARED_LIB_SUF := .dylib
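
Working through the libtool rules referenced in the comment above (assuming
the standard current/age/revision update rules): the old values MAJOR=8,
MINOR=0, PATCH=1 decode to (c, a, r) = (8, 0, 1). Because this release is ABI
incompatible, interfaces changed, so c1 = c + 1 = 9, r1 = 0, and a1 = 0,
giving MAJOR = c1 - a1 = 9, MINOR = a1 = 0, PATCH = r1 = 0, exactly the values
set in the hunk.
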
diff --git a/media/libvpx/libvpx/test/android/get_files.py b/media/libvpx/libvpx/test/android/get_files.py
index 1c69740d2b..98ce7b1947 100644
--- a/media/libvpx/libvpx/test/android/get_files.py
+++ b/media/libvpx/libvpx/test/android/get_files.py
@@ -38,7 +38,7 @@ def get_file_sha(filename):
buf = file.read(HASH_CHUNK)
return sha_hash.hexdigest()
except IOError:
- print "Error reading " + filename
+ print("Error reading " + filename)
# Downloads a file from a url, and then checks the sha against the passed
# in sha
@@ -67,7 +67,7 @@ try:
getopt.getopt(sys.argv[1:], \
"u:i:o:", ["url=", "input_csv=", "output_dir="])
except:
- print 'get_files.py -u <url> -i <input_csv> -o <output_dir>'
+ print('get_files.py -u <url> -i <input_csv> -o <output_dir>')
sys.exit(2)
for opt, arg in opts:
@@ -79,7 +79,7 @@ for opt, arg in opts:
local_resource_path = os.path.join(arg)
if len(sys.argv) != 7:
- print "Expects two paths and a url!"
+ print("Expects two paths and a url!")
exit(1)
if not os.path.isdir(local_resource_path):
@@ -89,7 +89,7 @@ file_list_csv = open(file_list_path, "rb")
# Our 'csv' file uses multiple spaces as a delimiter, python's
# csv class only uses single character delimiters, so we convert them below
-file_list_reader = csv.reader((re.sub(' +', ' ', line) \
+file_list_reader = csv.reader((re.sub(' +', ' ', line.decode('utf-8')) \
for line in file_list_csv), delimiter = ' ')
file_shas = []
@@ -104,15 +104,16 @@ for row in file_list_reader:
file_list_csv.close()
# Download files, only if they don't already exist and have correct shas
-for filename, sha in itertools.izip(file_names, file_shas):
+for filename, sha in zip(file_names, file_shas):
+ filename = filename.lstrip('*')
path = os.path.join(local_resource_path, filename)
if os.path.isfile(path) \
and get_file_sha(path) == sha:
- print path + ' exists, skipping'
+ print(path + ' exists, skipping')
continue
for retry in range(0, ftp_retries):
- print "Downloading " + path
+ print("Downloading " + path)
if not download_and_check_sha(url, filename, sha):
- print "Sha does not match, retrying..."
+ print("Sha does not match, retrying...")
else:
break
diff --git a/media/libvpx/libvpx/test/avg_test.cc b/media/libvpx/libvpx/test/avg_test.cc
index ede9c0ba8c..7816912ff7 100644
--- a/media/libvpx/libvpx/test/avg_test.cc
+++ b/media/libvpx/libvpx/test/avg_test.cc
@@ -719,6 +719,15 @@ INSTANTIATE_TEST_SUITE_P(
make_tuple(1024, &vp9_block_error_fp_neon)));
#endif // HAVE_NEON
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, BlockErrorTestFP,
+ ::testing::Values(make_tuple(16, &vp9_block_error_fp_sve),
+ make_tuple(64, &vp9_block_error_fp_sve),
+ make_tuple(256, &vp9_block_error_fp_sve),
+ make_tuple(1024, &vp9_block_error_fp_sve)));
+#endif // HAVE_SVE
+
#if HAVE_MSA
INSTANTIATE_TEST_SUITE_P(
MSA, AverageTest,
diff --git a/media/libvpx/libvpx/test/codec_factory.h b/media/libvpx/libvpx/test/codec_factory.h
index c7e8f54847..179ccdf011 100644
--- a/media/libvpx/libvpx/test/codec_factory.h
+++ b/media/libvpx/libvpx/test/codec_factory.h
@@ -164,7 +164,9 @@ const libvpx_test::VP8CodecFactory kVP8;
&libvpx_test::kVP8)), \
__VA_ARGS__))
#else
-#define VP8_INSTANTIATE_TEST_SUITE(test, ...)
+// static_assert() is used to avoid warnings about an extra ';' outside of a
+// function.
+#define VP8_INSTANTIATE_TEST_SUITE(test, ...) static_assert(CONFIG_VP8 == 0, "")
#endif // CONFIG_VP8
/*
@@ -259,7 +261,9 @@ const libvpx_test::VP9CodecFactory kVP9;
&libvpx_test::kVP9)), \
__VA_ARGS__))
#else
-#define VP9_INSTANTIATE_TEST_SUITE(test, ...)
+// static_assert() is used to avoid warnings about an extra ';' outside of a
+// function.
+#define VP9_INSTANTIATE_TEST_SUITE(test, ...) static_assert(CONFIG_VP9 == 0, "")
#endif // CONFIG_VP9
} // namespace libvpx_test
diff --git a/media/libvpx/libvpx/test/convolve_test.cc b/media/libvpx/libvpx/test/convolve_test.cc
index ffd5c41c63..11f7625137 100644
--- a/media/libvpx/libvpx/test/convolve_test.cc
+++ b/media/libvpx/libvpx/test/convolve_test.cc
@@ -1218,6 +1218,24 @@ WRAP(convolve8_neon, 12)
WRAP(convolve8_avg_neon, 12)
#endif // HAVE_NEON
+#if HAVE_SVE
+WRAP(convolve8_horiz_sve, 8)
+WRAP(convolve8_avg_horiz_sve, 8)
+WRAP(convolve8_horiz_sve, 10)
+WRAP(convolve8_avg_horiz_sve, 10)
+WRAP(convolve8_horiz_sve, 12)
+WRAP(convolve8_avg_horiz_sve, 12)
+#endif // HAVE_SVE
+
+#if HAVE_SVE2
+WRAP(convolve8_vert_sve2, 8)
+WRAP(convolve8_avg_vert_sve2, 8)
+WRAP(convolve8_vert_sve2, 10)
+WRAP(convolve8_avg_vert_sve2, 10)
+WRAP(convolve8_vert_sve2, 12)
+WRAP(convolve8_avg_vert_sve2, 12)
+#endif // HAVE_SVE2
+
WRAP(convolve_copy_c, 8)
WRAP(convolve_avg_c, 8)
WRAP(convolve8_horiz_c, 8)
@@ -1438,6 +1456,74 @@ INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, ConvolveTest,
::testing::ValuesIn(kArrayConvolve_neon_dotprod));
#endif // HAVE_NEON_DOTPROD
+#if HAVE_SVE
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions convolve8_sve(
+ wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_sve_8,
+ wrap_convolve8_avg_horiz_sve_8, wrap_convolve8_vert_c_8,
+ wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8,
+ wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
+ wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8,
+ wrap_convolve8_avg_c_8, 8);
+const ConvolveFunctions convolve10_sve(
+ wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
+ wrap_convolve8_horiz_sve_10, wrap_convolve8_avg_horiz_sve_10,
+ wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
+ wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10,
+ wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
+ wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
+ 10);
+const ConvolveFunctions convolve12_sve(
+ wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
+ wrap_convolve8_horiz_sve_12, wrap_convolve8_avg_horiz_sve_12,
+ wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
+ wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12,
+ wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
+ wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
+ 12);
+
+const ConvolveParam kArrayConvolve_sve[] = { ALL_SIZES(convolve8_sve),
+ ALL_SIZES(convolve10_sve),
+ ALL_SIZES(convolve12_sve) };
+INSTANTIATE_TEST_SUITE_P(SVE, ConvolveTest,
+ ::testing::ValuesIn(kArrayConvolve_sve));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_SVE
+
+#if HAVE_SVE2
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions convolve8_sve2(
+ wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_c_8,
+ wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_sve2_8,
+ wrap_convolve8_avg_vert_sve2_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8,
+ wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
+ wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8,
+ wrap_convolve8_avg_c_8, 8);
+const ConvolveFunctions convolve10_sve2(
+ wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, wrap_convolve8_horiz_c_10,
+ wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_sve2_10,
+ wrap_convolve8_avg_vert_sve2_10, wrap_convolve8_c_10,
+ wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10,
+ wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
+ wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
+ 10);
+const ConvolveFunctions convolve12_sve2(
+ wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, wrap_convolve8_horiz_c_12,
+ wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_sve2_12,
+ wrap_convolve8_avg_vert_sve2_12, wrap_convolve8_c_12,
+ wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12,
+ wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
+ wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
+ 12);
+
+const ConvolveParam kArrayConvolve_sve2[] = { ALL_SIZES(convolve8_sve2),
+ ALL_SIZES(convolve10_sve2),
+ ALL_SIZES(convolve12_sve2) };
+INSTANTIATE_TEST_SUITE_P(SVE2, ConvolveTest,
+ ::testing::ValuesIn(kArrayConvolve_sve2));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_SVE2
+
#if HAVE_NEON_I8MM
const ConvolveFunctions convolve8_neon_i8mm(
vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_i8mm,
diff --git a/media/libvpx/libvpx/test/encode_api_test.cc b/media/libvpx/libvpx/test/encode_api_test.cc
index 508083673a..ca3b17a5d5 100644
--- a/media/libvpx/libvpx/test/encode_api_test.cc
+++ b/media/libvpx/libvpx/test/encode_api_test.cc
@@ -8,7 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <cassert>
#include <climits>
+#include <cstdint>
#include <cstring>
#include <initializer_list>
#include <new>
@@ -44,6 +46,49 @@ bool IsVP9(vpx_codec_iface_t *iface) {
0;
}
+void *Memset16(void *dest, int val, size_t length) {
+ uint16_t *dest16 = reinterpret_cast<uint16_t *>(dest);
+ for (size_t i = 0; i < length; i++) {
+ *dest16++ = val;
+ }
+ return dest;
+}
+
+vpx_image_t *CreateImage(vpx_bit_depth_t bit_depth, vpx_img_fmt_t fmt,
+ unsigned int width, unsigned int height) {
+ assert(fmt != VPX_IMG_FMT_NV12);
+ if (bit_depth > VPX_BITS_8) {
+ fmt = static_cast<vpx_img_fmt_t>(fmt | VPX_IMG_FMT_HIGHBITDEPTH);
+ }
+ vpx_image_t *image = vpx_img_alloc(nullptr, fmt, width, height, 1);
+ if (!image) return image;
+
+ const int val = 1 << (bit_depth - 1);
+ const unsigned int uv_h =
+ (image->d_h + image->y_chroma_shift) >> image->y_chroma_shift;
+ const unsigned int uv_w =
+ (image->d_w + image->x_chroma_shift) >> image->x_chroma_shift;
+ if (bit_depth > VPX_BITS_8) {
+ for (unsigned int i = 0; i < image->d_h; ++i) {
+ Memset16(image->planes[0] + i * image->stride[0], val, image->d_w);
+ }
+ for (unsigned int i = 0; i < uv_h; ++i) {
+ Memset16(image->planes[1] + i * image->stride[1], val, uv_w);
+ Memset16(image->planes[2] + i * image->stride[2], val, uv_w);
+ }
+ } else {
+ for (unsigned int i = 0; i < image->d_h; ++i) {
+ memset(image->planes[0] + i * image->stride[0], val, image->d_w);
+ }
+ for (unsigned int i = 0; i < uv_h; ++i) {
+ memset(image->planes[1] + i * image->stride[1], val, uv_w);
+ memset(image->planes[2] + i * image->stride[2], val, uv_w);
+ }
+ }
+
+ return image;
+}
+
TEST(EncodeAPI, InvalidParams) {
uint8_t buf[1] = { 0 };
vpx_image_t img;
@@ -198,7 +243,51 @@ TEST(EncodeAPI, RandomPixelsVp8) {
ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
// Generate random frame data and encode
- uint8_t img[1280 * 720 * 3 / 2];
+ libvpx_test::RandomVideoSource video;
+ video.SetSize(cfg.g_w, cfg.g_h);
+ video.SetImageFormat(VPX_IMG_FMT_I420);
+ video.Begin();
+ ASSERT_EQ(vpx_codec_encode(&enc, video.img(), video.pts(), video.duration(),
+ /*flags=*/0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_OK);
+
+ // Destroy libvpx encoder
+ vpx_codec_destroy(&enc);
+}
+
+TEST(EncodeAPI, ChangeToL1T3AndSetBitrateVp8) {
+ // Initialize libvpx encoder
+ vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+ vpx_codec_enc_cfg_t cfg;
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+ cfg.g_threads = 1;
+ cfg.g_profile = 0;
+ cfg.g_w = 1;
+ cfg.g_h = 64;
+ cfg.g_bit_depth = VPX_BITS_8;
+ cfg.g_input_bit_depth = 8;
+ cfg.g_timebase.num = 1;
+ cfg.g_timebase.den = 1000000;
+ cfg.g_pass = VPX_RC_ONE_PASS;
+ cfg.g_lag_in_frames = 0;
+ cfg.rc_dropframe_thresh = 0; // Don't drop frames
+ cfg.rc_resize_allowed = 0;
+ cfg.rc_end_usage = VPX_VBR;
+ cfg.rc_target_bitrate = 10;
+ cfg.rc_min_quantizer = 2;
+ cfg.rc_max_quantizer = 58;
+ cfg.kf_mode = VPX_KF_AUTO;
+ cfg.kf_min_dist = 0;
+ cfg.kf_max_dist = 10000;
+
+ vpx_codec_ctx_t enc;
+ ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+ ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, -6), VPX_CODEC_OK);
+
+ // Generate random frame data and encode
+ uint8_t img[1 * 64 * 3 / 2];
libvpx_test::ACMRandom rng;
for (size_t i = 0; i < sizeof(img); ++i) {
img[i] = rng.Rand8();
@@ -207,13 +296,142 @@ TEST(EncodeAPI, RandomPixelsVp8) {
ASSERT_EQ(
vpx_img_wrap(&img_wrapper, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1, img),
&img_wrapper);
- ASSERT_EQ(vpx_codec_encode(&enc, &img_wrapper, 0, 1, 0, VPX_DL_BEST_QUALITY),
+ vpx_enc_frame_flags_t flags = VPX_EFLAG_FORCE_KF;
+ ASSERT_EQ(
+ vpx_codec_encode(&enc, &img_wrapper, 0, 500000, flags, VPX_DL_REALTIME),
+ VPX_CODEC_OK);
+ ASSERT_EQ(vpx_codec_encode(&enc, nullptr, -1, 0, 0, 0), VPX_CODEC_OK);
+
+ cfg.rc_target_bitrate = 4294967;
+ // Set the scalability mode to L1T3.
+ cfg.ts_number_layers = 3;
+ cfg.ts_periodicity = 4;
+ cfg.ts_layer_id[0] = 0;
+ cfg.ts_layer_id[1] = 2;
+ cfg.ts_layer_id[2] = 1;
+ cfg.ts_layer_id[3] = 2;
+ cfg.ts_rate_decimator[0] = 4;
+ cfg.ts_rate_decimator[1] = 2;
+ cfg.ts_rate_decimator[2] = 1;
+ // Bitrate allocation L0: 50% L1: 20% L2: 30%
+ cfg.layer_target_bitrate[0] = cfg.ts_target_bitrate[0] =
+ 50 * cfg.rc_target_bitrate / 100;
+ cfg.layer_target_bitrate[1] = cfg.ts_target_bitrate[1] =
+ 70 * cfg.rc_target_bitrate / 100;
+ cfg.layer_target_bitrate[2] = cfg.ts_target_bitrate[2] =
+ cfg.rc_target_bitrate;
+ cfg.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212;
+ cfg.g_error_resilient = VPX_ERROR_RESILIENT_DEFAULT;
+ ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK);
+
+ ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_TEMPORAL_LAYER_ID, 2),
VPX_CODEC_OK);
+ constexpr vpx_enc_frame_flags_t VP8_UPDATE_NOTHING =
+ VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
+ // Layer 2: only reference last frame, no updates
+ // It only depends on layer 0
+ flags = VP8_UPDATE_NOTHING | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_REF_GF;
+ ASSERT_EQ(
+ vpx_codec_encode(&enc, &img_wrapper, 0, 500000, flags, VPX_DL_REALTIME),
+ VPX_CODEC_OK);
+
// Destroy libvpx encoder
vpx_codec_destroy(&enc);
}
-#endif
+
+// Emulates the WebCodecs VideoEncoder interface.
+class VP8Encoder {
+ public:
+ explicit VP8Encoder(int speed) : speed_(speed) {}
+ ~VP8Encoder();
+
+ void Configure(unsigned int threads, unsigned int width, unsigned int height,
+ vpx_rc_mode end_usage, vpx_enc_deadline_t deadline);
+ void Encode(bool key_frame);
+
+ private:
+ const int speed_;
+ bool initialized_ = false;
+ vpx_codec_enc_cfg_t cfg_;
+ vpx_codec_ctx_t enc_;
+ int frame_index_ = 0;
+ vpx_enc_deadline_t deadline_ = 0;
+};
+
+VP8Encoder::~VP8Encoder() {
+ if (initialized_) {
+ EXPECT_EQ(vpx_codec_destroy(&enc_), VPX_CODEC_OK);
+ }
+}
+
+void VP8Encoder::Configure(unsigned int threads, unsigned int width,
+ unsigned int height, vpx_rc_mode end_usage,
+ vpx_enc_deadline_t deadline) {
+ deadline_ = deadline;
+
+ if (!initialized_) {
+ vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0),
+ VPX_CODEC_OK);
+ cfg_.g_threads = threads;
+ cfg_.g_w = width;
+ cfg_.g_h = height;
+ cfg_.g_timebase.num = 1;
+ cfg_.g_timebase.den = 1000 * 1000; // microseconds
+ cfg_.g_pass = VPX_RC_ONE_PASS;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_end_usage = end_usage;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 58;
+ ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK);
+ ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK);
+ initialized_ = true;
+ return;
+ }
+
+ cfg_.g_threads = threads;
+ cfg_.g_w = width;
+ cfg_.g_h = height;
+ cfg_.rc_end_usage = end_usage;
+ ASSERT_EQ(vpx_codec_enc_config_set(&enc_, &cfg_), VPX_CODEC_OK)
+ << vpx_codec_error_detail(&enc_);
+}
+
+void VP8Encoder::Encode(bool key_frame) {
+ const vpx_codec_cx_pkt_t *pkt;
+ vpx_image_t *image =
+ CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg_.g_w, cfg_.g_h);
+ ASSERT_NE(image, nullptr);
+ const vpx_enc_frame_flags_t flags = key_frame ? VPX_EFLAG_FORCE_KF : 0;
+ ASSERT_EQ(vpx_codec_encode(&enc_, image, frame_index_, 1, flags, deadline_),
+ VPX_CODEC_OK);
+ ++frame_index_;
+ vpx_codec_iter_t iter = nullptr;
+ while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) {
+ ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT);
+ if (key_frame) {
+ ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, VPX_FRAME_IS_KEY);
+ }
+ }
+ vpx_img_free(image);
+}
+
+// This is the reproducer testcase for crbug.com/324459561. However,
+// just running this test is not enough to reproduce the bug. We also
+// need to send signals to the test.
+TEST(EncodeAPI, Chromium324459561) {
+ VP8Encoder encoder(-12);
+
+ encoder.Configure(11, 1685, 652, VPX_CBR, VPX_DL_REALTIME);
+
+ encoder.Encode(true);
+ encoder.Encode(true);
+ encoder.Encode(true);
+
+ encoder.Configure(0, 1685, 1, VPX_VBR, VPX_DL_REALTIME);
+}
+#endif // CONFIG_VP8_ENCODER
// Set up 2 spatial streams with 2 temporal layers per stream, and generate
// invalid configuration by setting the temporal layer rate allocation
@@ -499,6 +717,131 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) {
}
}
+TEST(EncodeAPI, ConfigResizeBiggerAfterInit) {
+ for (const auto *iface : kCodecIfaces) {
+ SCOPED_TRACE(vpx_codec_iface_name(iface));
+ vpx_codec_enc_cfg_t cfg;
+ vpx_codec_ctx_t enc;
+
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+ EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, 1, 1, &enc, &cfg));
+
+ cfg.g_w = 1920;
+ cfg.g_h = 1;
+ EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg),
+ IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM);
+
+ EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+ }
+}
+
+TEST(EncodeAPI, ConfigResizeBiggerAfterEncode) {
+ for (const auto *iface : kCodecIfaces) {
+ SCOPED_TRACE(vpx_codec_iface_name(iface));
+ vpx_codec_enc_cfg_t cfg;
+ vpx_codec_ctx_t enc;
+
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+ EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, 1, 1, &enc, &cfg));
+ EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc));
+
+ cfg.g_w = 1920;
+ cfg.g_h = 1;
+ EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg),
+ IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM);
+
+ cfg.g_w = 1920;
+ cfg.g_h = 1080;
+ EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg),
+ IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM);
+
+ EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+ }
+}
+
+TEST(EncodeAPI, PtsSmallerThanInitialPts) {
+ for (const auto *iface : kCodecIfaces) {
+ // Initialize libvpx encoder.
+ vpx_codec_ctx_t enc;
+ vpx_codec_enc_cfg_t cfg;
+
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+ ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+ // Create input image.
+ vpx_image_t *const image =
+ CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+ ASSERT_NE(image, nullptr);
+
+ // Encode frame.
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 12, 1, 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_OK);
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 13, 1, 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_OK);
+ // pts (10) is smaller than the initial pts (12).
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 10, 1, 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+
+ // Free resources.
+ vpx_img_free(image);
+ ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+ }
+}
+
+TEST(EncodeAPI, PtsOrDurationTooBig) {
+ for (const auto *iface : kCodecIfaces) {
+ // Initialize libvpx encoder.
+ vpx_codec_ctx_t enc;
+ vpx_codec_enc_cfg_t cfg;
+
+ ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+ ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+ // Create input image.
+ vpx_image_t *const image =
+ CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+ ASSERT_NE(image, nullptr);
+
+ // Encode frame.
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_OK);
+#if ULONG_MAX > INT64_MAX
+ // duration is too big.
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 0, (1ul << 63), 0, 2),
+ VPX_CODEC_INVALID_PARAM);
+#endif
+ // pts, when converted to ticks, is too big.
+ ASSERT_EQ(vpx_codec_encode(&enc, image, INT64_MAX / 1000000 + 1, 1, 0,
+ VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+#if ULONG_MAX > INT64_MAX
+ // duration is too big.
+ ASSERT_EQ(
+ vpx_codec_encode(&enc, image, 0, (1ul << 63), 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+ // pts + duration is too big.
+ ASSERT_EQ(
+ vpx_codec_encode(&enc, image, 1, INT64_MAX, 0, VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+#endif
+ // pts + duration, when converted to ticks, is too big.
+#if ULONG_MAX > INT64_MAX
+ ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 0xbd6b566b15c7, 0,
+ VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+#endif
+ ASSERT_EQ(vpx_codec_encode(&enc, image, INT64_MAX / 1000000, 1, 0,
+ VPX_DL_BEST_QUALITY),
+ VPX_CODEC_INVALID_PARAM);
+
+ // Free resources.
+ vpx_img_free(image);
+ ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+ }
+}
+
#if CONFIG_VP9_ENCODER
// Frame size needed to trigger the overflow exceeds the max buffer allowed on
// 32-bit systems defined by VPX_MAX_ALLOCABLE_MEMORY
@@ -528,28 +871,16 @@ TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) {
}
#endif // VPX_ARCH_X86_64 || VPX_ARCH_AARCH64
-vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) {
- vpx_image_t *image =
- vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, width, height, 1);
- if (!image) return image;
-
- for (unsigned int i = 0; i < image->d_h; ++i) {
- memset(image->planes[0] + i * image->stride[0], 128, image->d_w);
- }
- const unsigned int uv_h = (image->d_h + 1) / 2;
- const unsigned int uv_w = (image->d_w + 1) / 2;
- for (unsigned int i = 0; i < uv_h; ++i) {
- memset(image->planes[1] + i * image->stride[1], 128, uv_w);
- memset(image->planes[2] + i * image->stride[2], 128, uv_w);
- }
-
- return image;
-}
-
// Emulates the WebCodecs VideoEncoder interface.
class VP9Encoder {
public:
- explicit VP9Encoder(int speed) : speed_(speed) {}
+ explicit VP9Encoder(int speed)
+ : speed_(speed), bit_depth_(VPX_BITS_8), fmt_(VPX_IMG_FMT_I420) {}
+ // The image format `fmt` must not have the VPX_IMG_FMT_HIGHBITDEPTH bit set.
+ // If bit_depth > 8, we will set the VPX_IMG_FMT_HIGHBITDEPTH bit before
+ // passing the image format to vpx_img_alloc().
+ VP9Encoder(int speed, vpx_bit_depth_t bit_depth, vpx_img_fmt_t fmt)
+ : speed_(speed), bit_depth_(bit_depth), fmt_(fmt) {}
~VP9Encoder();
void Configure(unsigned int threads, unsigned int width, unsigned int height,
@@ -558,6 +889,8 @@ class VP9Encoder {
private:
const int speed_;
+ const vpx_bit_depth_t bit_depth_;
+ const vpx_img_fmt_t fmt_;
bool initialized_ = false;
vpx_codec_enc_cfg_t cfg_;
vpx_codec_ctx_t enc_;
@@ -577,12 +910,22 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width,
deadline_ = deadline;
if (!initialized_) {
+ ASSERT_EQ(fmt_ & VPX_IMG_FMT_HIGHBITDEPTH, 0);
+ const bool high_bit_depth = bit_depth_ > VPX_BITS_8;
+ const bool is_420 = fmt_ == VPX_IMG_FMT_I420;
vpx_codec_iface_t *const iface = vpx_codec_vp9_cx();
ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0),
VPX_CODEC_OK);
cfg_.g_threads = threads;
+ // In profiles 0 and 2, only 4:2:0 format is allowed. In profiles 1 and 3,
+ // all other subsampling formats are allowed. In profiles 0 and 1, only bit
+ // depth 8 is allowed. In profiles 2 and 3, only bit depths 10 and 12 are
+ // allowed.
+ cfg_.g_profile = 2 * high_bit_depth + !is_420;
cfg_.g_w = width;
cfg_.g_h = height;
+ cfg_.g_bit_depth = bit_depth_;
+ cfg_.g_input_bit_depth = bit_depth_;
cfg_.g_timebase.num = 1;
cfg_.g_timebase.den = 1000 * 1000; // microseconds
cfg_.g_pass = VPX_RC_ONE_PASS;
@@ -590,7 +933,10 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width,
cfg_.rc_end_usage = end_usage;
cfg_.rc_min_quantizer = 2;
cfg_.rc_max_quantizer = 58;
- ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK);
+ ASSERT_EQ(
+ vpx_codec_enc_init(&enc_, iface, &cfg_,
+ high_bit_depth ? VPX_CODEC_USE_HIGHBITDEPTH : 0),
+ VPX_CODEC_OK);
ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK);
initialized_ = true;
return;
@@ -606,13 +952,13 @@ void VP9Encoder::Configure(unsigned int threads, unsigned int width,
void VP9Encoder::Encode(bool key_frame) {
const vpx_codec_cx_pkt_t *pkt;
- vpx_image_t *image = CreateImage(cfg_.g_w, cfg_.g_h);
+ vpx_image_t *image = CreateImage(bit_depth_, fmt_, cfg_.g_w, cfg_.g_h);
ASSERT_NE(image, nullptr);
const vpx_enc_frame_flags_t frame_flags = key_frame ? VPX_EFLAG_FORCE_KF : 0;
ASSERT_EQ(
vpx_codec_encode(&enc_, image, frame_index_, 1, frame_flags, deadline_),
VPX_CODEC_OK);
- frame_index_++;
+ ++frame_index_;
vpx_codec_iter_t iter = nullptr;
while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) {
ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT);
@@ -944,6 +1290,28 @@ TEST(EncodeAPI, Buganizer311294795) {
encoder.Encode(false);
encoder.Encode(false);
}
+
+TEST(EncodeAPI, Buganizer317105128) {
+ VP9Encoder encoder(-9);
+ encoder.Configure(0, 1, 1, VPX_CBR, VPX_DL_GOOD_QUALITY);
+ encoder.Configure(16, 1920, 1, VPX_CBR, VPX_DL_REALTIME);
+}
+
+TEST(EncodeAPI, Buganizer319964497) {
+ VP9Encoder encoder(7);
+ encoder.Configure(/*threads=*/1, /*width=*/320, /*height=*/240, VPX_VBR,
+ VPX_DL_REALTIME);
+ encoder.Encode(/*key_frame=*/true);
+ encoder.Encode(/*key_frame=*/true);
+ encoder.Encode(/*key_frame=*/false);
+ encoder.Configure(/*threads=*/1, /*width=*/1, /*height=*/1, VPX_VBR,
+ VPX_DL_REALTIME);
+ encoder.Encode(/*key_frame=*/false);
+ encoder.Configure(/*threads=*/1, /*width=*/2, /*height=*/2, VPX_CBR,
+ VPX_DL_REALTIME);
+ encoder.Encode(/*key_frame=*/false);
+}
+
#endif // CONFIG_VP9_ENCODER
} // namespace
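
For reference, the profile formula in VP9Encoder::Configure above,
cfg_.g_profile = 2 * high_bit_depth + !is_420, enumerates the four VP9
profiles exactly as the accompanying comment describes: (8-bit, 4:2:0) gives
profile 0, (8-bit, other subsampling) gives 1, (10/12-bit, 4:2:0) gives 2,
and (10/12-bit, other subsampling) gives 3.
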
diff --git a/media/libvpx/libvpx/test/frame_size_tests.cc b/media/libvpx/libvpx/test/frame_size_tests.cc
index eea5647a78..6306e4f2ca 100644
--- a/media/libvpx/libvpx/test/frame_size_tests.cc
+++ b/media/libvpx/libvpx/test/frame_size_tests.cc
@@ -193,7 +193,7 @@ TEST_F(VP9FrameSizeTestsLarge, ValidSizes) {
// size or almost 1 gig of memory.
// In total the allocations will exceed 2GiB which may cause a failure with
// mingw + wine, use a smaller size in that case.
-#if defined(_WIN32) && !defined(_WIN64) || defined(__OS2__)
+#if defined(_WIN32) && !defined(_WIN64)
video.SetSize(4096, 3072);
#else
video.SetSize(4096, 4096);
diff --git a/media/libvpx/libvpx/test/init_vpx_test.cc b/media/libvpx/libvpx/test/init_vpx_test.cc
index f66f00b5c1..353c5043eb 100644
--- a/media/libvpx/libvpx/test/init_vpx_test.cc
+++ b/media/libvpx/libvpx/test/init_vpx_test.cc
@@ -57,6 +57,9 @@ void init_vpx_test() {
if (!(caps & HAS_SVE)) {
append_negative_gtest_filter(":SVE.*:SVE/*");
}
+ if (!(caps & HAS_SVE2)) {
+ append_negative_gtest_filter(":SVE2.*:SVE2/*");
+ }
#elif VPX_ARCH_ARM
const int caps = arm_cpu_caps();
if (!(caps & HAS_NEON)) append_negative_gtest_filter(":NEON.*:NEON/*");
diff --git a/media/libvpx/libvpx/test/resize_test.cc b/media/libvpx/libvpx/test/resize_test.cc
index 20ad2229b4..f27bd7ebbc 100644
--- a/media/libvpx/libvpx/test/resize_test.cc
+++ b/media/libvpx/libvpx/test/resize_test.cc
@@ -7,8 +7,6 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <stdio.h>
-
#include <climits>
#include <vector>
#include "third_party/googletest/src/include/gtest/gtest.h"
@@ -598,6 +596,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
mismatch_nframes_ = 0;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
unsigned int last_w = cfg_.g_w;
unsigned int last_h = cfg_.g_h;
int resize_count = 0;
@@ -613,12 +612,12 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
}
}
-#if CONFIG_VP9_DECODER
// Verify that we get 1 resize down event in this test.
ASSERT_EQ(1, resize_count) << "Resizing should occur.";
EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
#else
- printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
+ GTEST_SKIP()
+ << "Warning: VP9 decoder unavailable, unable to check resize count!\n";
#endif
}
@@ -669,7 +668,8 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
  ASSERT_EQ(resize_count, 4) << "Resizing should occur four times.";
EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
#else
- printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
+ GTEST_SKIP()
+ << "Warning: VP9 decoder unavailable, unable to check resize count!\n";
#endif
}
diff --git a/media/libvpx/libvpx/test/sum_squares_test.cc b/media/libvpx/libvpx/test/sum_squares_test.cc
index d3c76a34d2..57037f1e30 100644
--- a/media/libvpx/libvpx/test/sum_squares_test.cc
+++ b/media/libvpx/libvpx/test/sum_squares_test.cc
@@ -119,6 +119,13 @@ INSTANTIATE_TEST_SUITE_P(
&vpx_sum_squares_2d_i16_neon)));
#endif // HAVE_NEON
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, SumSquaresTest,
+ ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
+ &vpx_sum_squares_2d_i16_sve)));
+#endif // HAVE_SVE
+
#if HAVE_SSE2
INSTANTIATE_TEST_SUITE_P(
SSE2, SumSquaresTest,
diff --git a/media/libvpx/libvpx/test/variance_test.cc b/media/libvpx/libvpx/test/variance_test.cc
index b8320e9ceb..5cf6a5fb8e 100644
--- a/media/libvpx/libvpx/test/variance_test.cc
+++ b/media/libvpx/libvpx/test/variance_test.cc
@@ -29,6 +29,9 @@ namespace {
typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride);
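+// Signature shared by the vpx_get<W>x<H>var functions under test, which
+// report both the sum of squared differences (sse) and the sum of
+// differences (sum).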
+typedef void (*GetVarianceFunc)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse, int *sum);
typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
using libvpx_test::ACMRandom;
@@ -63,35 +66,65 @@ static unsigned int mb_ss_ref(const int16_t *src) {
* Our codebase calculates the "diff" value in the variance algorithm by
* (src - ref).
*/
-static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w,
- int l2h, int src_stride, int ref_stride,
- uint32_t *sse_ptr, bool use_high_bit_depth_,
- vpx_bit_depth_t bit_depth) {
- int64_t se = 0;
- uint64_t sse = 0;
- const int w = 1 << l2w;
- const int h = 1 << l2h;
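+// Shared helper: accumulates the sum of differences (se) and the sum of
+// squared differences (sse) over a w x h block, then rounds both for the
+// given bit depth.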
+static void variance(const uint8_t *src, int src_stride, const uint8_t *ref,
+ int ref_stride, int w, int h, bool use_high_bit_depth_,
+ uint64_t *sse, int64_t *se, vpx_bit_depth_t bit_depth) {
+ int64_t se_long = 0;
+ uint64_t sse_long = 0;
+
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
- int diff;
+ int diff = 0;
if (!use_high_bit_depth_) {
diff = src[y * src_stride + x] - ref[y * ref_stride + x];
- se += diff;
- sse += diff * diff;
#if CONFIG_VP9_HIGHBITDEPTH
} else {
diff = CONVERT_TO_SHORTPTR(src)[y * src_stride + x] -
CONVERT_TO_SHORTPTR(ref)[y * ref_stride + x];
- se += diff;
- sse += diff * diff;
#endif // CONFIG_VP9_HIGHBITDEPTH
}
+ se_long += diff;
+ sse_long += diff * diff;
}
}
- RoundHighBitDepth(bit_depth, &se, &sse);
- *sse_ptr = static_cast<uint32_t>(sse);
+
+ RoundHighBitDepth(bit_depth, &se_long, &sse_long);
+
+ *sse = sse_long;
+ *se = se_long;
+}
+
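+// Reference implementation for the GetVariance functions under test: reports
+// sse and sum separately instead of combining them into a variance.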
+static void get_variance_ref(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int l2w,
+ int l2h, bool use_high_bit_depth_, uint32_t *sse,
+ int *se, vpx_bit_depth_t bit_depth) {
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+ int64_t se_long = 0;
+ uint64_t sse_long = 0;
+
+ variance(src, src_stride, ref, ref_stride, w, h, use_high_bit_depth_,
+ &sse_long, &se_long, bit_depth);
+
+ *sse = static_cast<uint32_t>(sse_long);
+ *se = static_cast<int>(se_long);
+}
+
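+// Reference variance: sse - (se * se) / (w * h), built on the shared helper.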
+static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w,
+ int l2h, int src_stride, int ref_stride,
+ uint32_t *sse_ptr, bool use_high_bit_depth_,
+ vpx_bit_depth_t bit_depth) {
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+ int64_t se_long = 0;
+ uint64_t sse_long = 0;
+
+ variance(src, src_stride, ref, ref_stride, w, h, use_high_bit_depth_,
+ &sse_long, &se_long, bit_depth);
+
+ *sse_ptr = static_cast<uint32_t>(sse_long);
return static_cast<uint32_t>(
- sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h)));
+ sse_long - ((static_cast<int64_t>(se_long) * se_long) >> (l2w + l2h)));
}
/* The subpel reference functions differ from the codec version in one aspect:
@@ -337,6 +370,9 @@ class MainTestClass
void OneQuarterTest();
void SpeedTest();
+ // GetVariance tests
+ void RefTestGetVar();
+
// MSE/SSE tests
void RefTestMse();
void RefTestSse();
@@ -493,6 +529,35 @@ void MainTestClass<VarianceFunctionType>::SpeedTest() {
}
////////////////////////////////////////////////////////////////////////////////
+// Tests related to GetVariance.
+template <typename GetVarianceFunctionType>
+void MainTestClass<GetVarianceFunctionType>::RefTestGetVar() {
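+  // Fill src/ref with random pixels and compare the function under test
+  // against the C reference for both sse and sum.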
+ for (int i = 0; i < 10; ++i) {
+ for (int j = 0; j < block_size(); j++) {
+ if (!use_high_bit_depth()) {
+ src_[j] = rnd_.Rand8();
+ ref_[j] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+ }
+ unsigned int sse1, sse2;
+ int sum1, sum2;
+ const int stride = width();
+ ASM_REGISTER_STATE_CHECK(
+ params_.func(src_, stride, ref_, stride, &sse1, &sum1));
+ get_variance_ref(src_, stride, ref_, stride, params_.log2width,
+ params_.log2height, use_high_bit_depth(), &sse2, &sum2,
+ params_.bit_depth);
+ EXPECT_EQ(sse1, sse2) << "Error at test index: " << i;
+ EXPECT_EQ(sum1, sum2) << "Error at test index: " << i;
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
// Tests related to MSE / SSE.
template <typename FunctionType>
@@ -766,6 +831,7 @@ void SubpelVarianceTest<vpx_subp_avg_variance_fn_t>::RefTest() {
typedef MainTestClass<Get4x4SseFunc> VpxSseTest;
typedef MainTestClass<vpx_variance_fn_t> VpxMseTest;
typedef MainTestClass<vpx_variance_fn_t> VpxVarianceTest;
+typedef MainTestClass<GetVarianceFunc> VpxGetVarianceTest;
typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxSubpelVarianceTest;
typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t> VpxSubpelAvgVarianceTest;
@@ -779,6 +845,7 @@ TEST_P(VpxVarianceTest, Ref) { RefTest(); }
TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); }
TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); }
TEST_P(VpxVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(VpxGetVarianceTest, RefGetVar) { RefTestGetVar(); }
TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); }
@@ -818,6 +885,16 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 3, &vpx_variance4x8_c),
VarianceParams(2, 2, &vpx_variance4x4_c)));
+typedef TestParams<GetVarianceFunc> GetVarianceParams;
+INSTANTIATE_TEST_SUITE_P(
+ C, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_c),
+ GetVarianceParams(3, 3, &vpx_get8x8var_c),
+ GetVarianceParams(4, 4, &vpx_get16x16var_c),
+ GetVarianceParams(3, 3, &vpx_get8x8var_c),
+ GetVarianceParams(4, 4, &vpx_get16x16var_c),
+ GetVarianceParams(3, 3, &vpx_get8x8var_c)));
+
typedef TestParams<vpx_subpixvariance_fn_t> SubpelVarianceParams;
INSTANTIATE_TEST_SUITE_P(
C, VpxSubpelVarianceTest,
@@ -856,6 +933,7 @@ INSTANTIATE_TEST_SUITE_P(
#if CONFIG_VP9_HIGHBITDEPTH
typedef MainTestClass<vpx_variance_fn_t> VpxHBDVarianceTest;
+typedef MainTestClass<GetVarianceFunc> VpxHBDGetVarianceTest;
typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxHBDSubpelVarianceTest;
typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t>
VpxHBDSubpelAvgVarianceTest;
@@ -865,6 +943,7 @@ TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); }
TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); }
TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
TEST_P(VpxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(VpxHBDGetVarianceTest, RefGetVar) { RefTestGetVar(); }
TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); }
TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
@@ -933,6 +1012,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 2, &vpx_highbd_8_variance4x4_c, 8)));
INSTANTIATE_TEST_SUITE_P(
+ C, VpxHBDGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_c, 12),
+ GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_c, 12),
+ GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_c, 10),
+ GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_c, 10),
+ GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_c, 8),
+ GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_c, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
C, VpxHBDSubpelVarianceTest,
::testing::Values(
SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
@@ -1119,6 +1207,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 2, &vpx_variance4x4_sse2)));
INSTANTIATE_TEST_SUITE_P(
+ SSE2, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_sse2),
+ GetVarianceParams(3, 3, &vpx_get8x8var_sse2),
+ GetVarianceParams(4, 4, &vpx_get16x16var_sse2),
+ GetVarianceParams(3, 3, &vpx_get8x8var_sse2),
+ GetVarianceParams(4, 4, &vpx_get16x16var_sse2),
+ GetVarianceParams(3, 3, &vpx_get8x8var_sse2)));
+
+INSTANTIATE_TEST_SUITE_P(
SSE2, VpxSubpelVarianceTest,
::testing::Values(
SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0),
@@ -1198,6 +1295,16 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sse2, 8)));
INSTANTIATE_TEST_SUITE_P(
+ SSE2, VpxHBDGetVarianceTest,
+ ::testing::Values(
+ GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_sse2, 12),
+ GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_sse2, 12),
+ GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_sse2, 10),
+ GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_sse2, 10),
+ GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_sse2, 8),
+ GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_sse2, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
SSE2, VpxHBDSubpelVarianceTest,
::testing::Values(
SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2,
@@ -1475,6 +1582,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 3, &vpx_variance4x8_neon),
VarianceParams(2, 2, &vpx_variance4x4_neon)));
+INSTANTIATE_TEST_SUITE_P(
+ NEON, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_neon),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon),
+ GetVarianceParams(4, 4, &vpx_get16x16var_neon),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon),
+ GetVarianceParams(4, 4, &vpx_get16x16var_neon),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon)));
+
#if HAVE_NEON_DOTPROD
INSTANTIATE_TEST_SUITE_P(
NEON_DOTPROD, VpxSseTest,
@@ -1502,6 +1618,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(3, 2, &vpx_variance8x4_neon_dotprod),
VarianceParams(2, 3, &vpx_variance4x8_neon_dotprod),
VarianceParams(2, 2, &vpx_variance4x4_neon_dotprod)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON_DOTPROD, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod),
+ GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod),
+ GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod),
+ GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod)));
#endif // HAVE_NEON_DOTPROD
INSTANTIATE_TEST_SUITE_P(
@@ -1555,9 +1680,6 @@ INSTANTIATE_TEST_SUITE_P(
MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8),
MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8)));
-// TODO(webm:1819): Re-enable when vpx_highbd_8_mse16x16_neon_dotprod, etc. can
-// be used again.
-#if 0
#if HAVE_NEON_DOTPROD
INSTANTIATE_TEST_SUITE_P(
NEON_DOTPROD, VpxHBDMseTest,
@@ -1567,7 +1689,19 @@ INSTANTIATE_TEST_SUITE_P(
MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8),
MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8)));
#endif // HAVE_NEON_DOTPROD
-#endif // 0
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, VpxHBDMseTest,
+ ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sve, VPX_BITS_12),
+ MseParams(4, 3, &vpx_highbd_12_mse16x8_sve, VPX_BITS_12),
+ MseParams(3, 4, &vpx_highbd_12_mse8x16_sve, VPX_BITS_12),
+ MseParams(3, 3, &vpx_highbd_12_mse8x8_sve, VPX_BITS_12),
+ MseParams(4, 4, &vpx_highbd_10_mse16x16_sve, VPX_BITS_10),
+ MseParams(4, 3, &vpx_highbd_10_mse16x8_sve, VPX_BITS_10),
+ MseParams(3, 4, &vpx_highbd_10_mse8x16_sve, VPX_BITS_10),
+ MseParams(3, 3, &vpx_highbd_10_mse8x8_sve, VPX_BITS_10)));
+#endif // HAVE_SVE
INSTANTIATE_TEST_SUITE_P(
NEON, VpxHBDVarianceTest,
@@ -1613,6 +1747,28 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 2, &vpx_highbd_8_variance4x4_neon, 8)));
INSTANTIATE_TEST_SUITE_P(
+ NEON, VpxHBDGetVarianceTest,
+ ::testing::Values(
+ GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_neon, 12),
+ GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_neon, 12),
+ GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_neon, 10),
+ GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_neon, 10),
+ GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_neon, 8),
+ GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_neon, 8)));
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+ SVE, VpxHBDGetVarianceTest,
+ ::testing::Values(
+ GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_sve, 12),
+ GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_sve, 12),
+ GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_sve, 10),
+ GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_sve, 10),
+ GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_sve, 8),
+ GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_sve, 8)));
+#endif // HAVE_SVE
+
+INSTANTIATE_TEST_SUITE_P(
NEON, VpxHBDSubpelVarianceTest,
::testing::Values(
SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_neon,
@@ -1815,6 +1971,53 @@ INSTANTIATE_TEST_SUITE_P(
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_NEON
+#if HAVE_SVE
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+ SVE, VpxHBDVarianceTest,
+ ::testing::Values(
+ VarianceParams(6, 6, &vpx_highbd_12_variance64x64_sve, 12),
+ VarianceParams(6, 5, &vpx_highbd_12_variance64x32_sve, 12),
+ VarianceParams(5, 6, &vpx_highbd_12_variance32x64_sve, 12),
+ VarianceParams(5, 5, &vpx_highbd_12_variance32x32_sve, 12),
+ VarianceParams(5, 4, &vpx_highbd_12_variance32x16_sve, 12),
+ VarianceParams(4, 5, &vpx_highbd_12_variance16x32_sve, 12),
+ VarianceParams(4, 4, &vpx_highbd_12_variance16x16_sve, 12),
+ VarianceParams(4, 3, &vpx_highbd_12_variance16x8_sve, 12),
+ VarianceParams(3, 4, &vpx_highbd_12_variance8x16_sve, 12),
+ VarianceParams(3, 3, &vpx_highbd_12_variance8x8_sve, 12),
+ VarianceParams(3, 2, &vpx_highbd_12_variance8x4_sve, 12),
+ VarianceParams(2, 3, &vpx_highbd_12_variance4x8_sve, 12),
+ VarianceParams(2, 2, &vpx_highbd_12_variance4x4_sve, 12),
+ VarianceParams(6, 6, &vpx_highbd_10_variance64x64_sve, 10),
+ VarianceParams(6, 5, &vpx_highbd_10_variance64x32_sve, 10),
+ VarianceParams(5, 6, &vpx_highbd_10_variance32x64_sve, 10),
+ VarianceParams(5, 5, &vpx_highbd_10_variance32x32_sve, 10),
+ VarianceParams(5, 4, &vpx_highbd_10_variance32x16_sve, 10),
+ VarianceParams(4, 5, &vpx_highbd_10_variance16x32_sve, 10),
+ VarianceParams(4, 4, &vpx_highbd_10_variance16x16_sve, 10),
+ VarianceParams(4, 3, &vpx_highbd_10_variance16x8_sve, 10),
+ VarianceParams(3, 4, &vpx_highbd_10_variance8x16_sve, 10),
+ VarianceParams(3, 3, &vpx_highbd_10_variance8x8_sve, 10),
+ VarianceParams(3, 2, &vpx_highbd_10_variance8x4_sve, 10),
+ VarianceParams(2, 3, &vpx_highbd_10_variance4x8_sve, 10),
+ VarianceParams(2, 2, &vpx_highbd_10_variance4x4_sve, 10),
+ VarianceParams(6, 6, &vpx_highbd_8_variance64x64_sve, 8),
+ VarianceParams(6, 5, &vpx_highbd_8_variance64x32_sve, 8),
+ VarianceParams(5, 6, &vpx_highbd_8_variance32x64_sve, 8),
+ VarianceParams(5, 5, &vpx_highbd_8_variance32x32_sve, 8),
+ VarianceParams(5, 4, &vpx_highbd_8_variance32x16_sve, 8),
+ VarianceParams(4, 5, &vpx_highbd_8_variance16x32_sve, 8),
+ VarianceParams(4, 4, &vpx_highbd_8_variance16x16_sve, 8),
+ VarianceParams(4, 3, &vpx_highbd_8_variance16x8_sve, 8),
+ VarianceParams(3, 4, &vpx_highbd_8_variance8x16_sve, 8),
+ VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sve, 8),
+ VarianceParams(3, 2, &vpx_highbd_8_variance8x4_sve, 8),
+ VarianceParams(2, 3, &vpx_highbd_8_variance4x8_sve, 8),
+ VarianceParams(2, 2, &vpx_highbd_8_variance4x4_sve, 8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_SVE
+
#if HAVE_MSA
INSTANTIATE_TEST_SUITE_P(MSA, SumOfSquaresTest,
::testing::Values(vpx_get_mb_ss_msa));
@@ -1846,6 +2049,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(2, 2, &vpx_variance4x4_msa)));
INSTANTIATE_TEST_SUITE_P(
+ MSA, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_msa),
+ GetVarianceParams(3, 3, &vpx_get8x8var_msa),
+ GetVarianceParams(4, 4, &vpx_get16x16var_msa),
+ GetVarianceParams(3, 3, &vpx_get8x8var_msa),
+ GetVarianceParams(4, 4, &vpx_get16x16var_msa),
+ GetVarianceParams(3, 3, &vpx_get8x8var_msa)));
+
+INSTANTIATE_TEST_SUITE_P(
MSA, VpxSubpelVarianceTest,
::testing::Values(
SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0),
@@ -1908,6 +2120,15 @@ INSTANTIATE_TEST_SUITE_P(
VarianceParams(3, 2, &vpx_variance8x4_vsx),
VarianceParams(2, 3, &vpx_variance4x8_vsx),
VarianceParams(2, 2, &vpx_variance4x4_vsx)));
+
+INSTANTIATE_TEST_SUITE_P(
+ VSX, VpxGetVarianceTest,
+ ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_vsx),
+ GetVarianceParams(3, 3, &vpx_get8x8var_vsx),
+ GetVarianceParams(4, 4, &vpx_get16x16var_vsx),
+ GetVarianceParams(3, 3, &vpx_get8x8var_vsx),
+ GetVarianceParams(4, 4, &vpx_get16x16var_vsx),
+ GetVarianceParams(3, 3, &vpx_get8x8var_vsx)));
#endif // HAVE_VSX
#if HAVE_MMI
diff --git a/media/libvpx/libvpx/test/video_source.h b/media/libvpx/libvpx/test/video_source.h
index 2194126f1f..2c035910db 100644
--- a/media/libvpx/libvpx/test/video_source.h
+++ b/media/libvpx/libvpx/test/video_source.h
@@ -236,7 +236,6 @@ class RandomVideoSource : public DummyVideoSource {
RandomVideoSource(int seed = ACMRandom::DeterministicSeed())
: rnd_(seed), seed_(seed) {}
- protected:
// Reset the RNG to get a matching stream for the second pass
void Begin() override {
frame_ = 0;
@@ -244,6 +243,7 @@ class RandomVideoSource : public DummyVideoSource {
FillFrame();
}
+ protected:
  // 15 frames of noise, followed by 15 static frames. Reset to 0 rather
  // than holding previous frames to encourage keyframes to be inserted.
void FillFrame() override {
diff --git a/media/libvpx/libvpx/test/vp8_datarate_test.cc b/media/libvpx/libvpx/test/vp8_datarate_test.cc
index aee27af66e..d47ed298fe 100644
--- a/media/libvpx/libvpx/test/vp8_datarate_test.cc
+++ b/media/libvpx/libvpx/test/vp8_datarate_test.cc
@@ -14,7 +14,7 @@
#include "test/i420_video_source.h"
#include "test/util.h"
#include "test/y4m_video_source.h"
-#include "vpx/vpx_codec.h"
+#include "vpx/vpx_encoder.h"
namespace {
@@ -260,6 +260,27 @@ class DatarateTestLarge
<< " The datarate for the file missed the target!";
}
+ virtual void MultiThreadsPSNRTest() {
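+    // Four-thread CBR encode with PSNR packet generation enabled; verify the
+    // resulting datarate stays within a factor of two of the target.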
+ denoiser_on_ = 0;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_threads = 4;
+ init_flags_ = VPX_CODEC_USE_PSNR;
+
+ ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+ 1280, 720, 30, 1, 0, 30);
+ cfg_.rc_target_bitrate = 1000;
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.5)
+ << " The datarate for the file exceeds the target!";
+
+ ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 2.0)
+ << " The datarate for the file missed the target!";
+ }
+
vpx_codec_pts_t last_pts_;
int64_t bits_in_buffer_model_;
double timebase_;
@@ -324,6 +345,8 @@ TEST_P(DatarateTestRealTime, DropFramesMultiThreads) {
DropFramesMultiThreadsTest();
}
+TEST_P(DatarateTestRealTime, MultiThreadsPSNR) { MultiThreadsPSNRTest(); }
+
TEST_P(DatarateTestRealTime, RegionOfInterest) {
denoiser_on_ = 0;
cfg_.rc_buf_initial_sz = 500;
diff --git a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc
index 50478f7635..d87fef5a46 100644
--- a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc
+++ b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc
@@ -149,9 +149,16 @@ class Vp8RcInterfaceTest
return;
}
int qp;
+ libvpx::UVDeltaQP uv_delta_qp;
encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp);
if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) {
ASSERT_EQ(rc_api_->GetQP(), qp);
+ uv_delta_qp = rc_api_->GetUVDeltaQP();
+      // The UV delta QPs are only set for screen content.
+ if (!rc_cfg_.is_screen) {
+ ASSERT_EQ(uv_delta_qp.uvdc_delta_q, 0);
+ ASSERT_EQ(uv_delta_qp.uvac_delta_q, 0);
+ }
} else {
num_drops_++;
}
diff --git a/media/libvpx/libvpx/test/vp9_block_error_test.cc b/media/libvpx/libvpx/test/vp9_block_error_test.cc
index 0645341ac1..c5ddcd58ab 100644
--- a/media/libvpx/libvpx/test/vp9_block_error_test.cc
+++ b/media/libvpx/libvpx/test/vp9_block_error_test.cc
@@ -215,4 +215,13 @@ const BlockErrorParam neon_block_error_tests[] = {
INSTANTIATE_TEST_SUITE_P(NEON, BlockErrorTest,
::testing::ValuesIn(neon_block_error_tests));
#endif // HAVE_NEON
+
+#if HAVE_SVE
+const BlockErrorParam sve_block_error_tests[] = { make_tuple(
+ &BlockError8BitWrapper<vp9_block_error_sve>,
+ &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8) };
+
+INSTANTIATE_TEST_SUITE_P(SVE, BlockErrorTest,
+ ::testing::ValuesIn(sve_block_error_tests));
+#endif // HAVE_SVE
} // namespace
diff --git a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc
index 33fa05c65c..5c23a5b0d5 100644
--- a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc
+++ b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc
@@ -10,115 +10,78 @@
#include <cstdint>
#include <new>
+#include <memory>
+
+#include "./vpx_config.h"
#include "test/codec_factory.h"
#include "test/encode_test_driver.h"
#include "test/util.h"
#include "test/yuv_video_source.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
+#if CONFIG_VP9_DECODER
+#include "vpx/vp8dx.h"
+#endif
#include "vp9/simple_encode.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_encoder.h"
#include "vpx/vpx_ext_ratectrl.h"
+#include "vpx/vpx_image.h"
#include "vpx/vpx_tpl.h"
#include "vpx_dsp/vpx_dsp_common.h"
namespace {
-constexpr int kModelMagicNumber = 51396;
-constexpr uintptr_t PrivMagicNumber = 5566;
-constexpr int kFrameNum = 5;
-constexpr int kFrameNumGOP = 30;
-constexpr int kFrameNumGOPShort = 4;
-constexpr int kLosslessCodingIndex = 2;
-constexpr int kFixedGOPSize = 9;
-// The range check in vp9_cx_iface.c shows that the max
-// lag in buffer is MAX_LAG_BUFFERS (25):
-// RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
-constexpr int kMaxLagInFrames = 25;
-constexpr int kDefaultMinGfInterval = 4;
-constexpr int kDefaultMaxGfInterval = 16;
-// The active gf interval might change for each GOP
-// See function "get_active_gf_inverval_range".
-// The numbers below are from manual inspection.
-constexpr int kReadMinGfInterval = 5;
-constexpr int kReadMaxGfInterval = 13;
-const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv";
-const double kPsnrThreshold = 30.4;
-
-struct ToyRateCtrl {
- int magic_number;
- int coding_index;
-
- int gop_global_index;
- int frames_since_key;
- int show_index;
+constexpr int kFrameNum = 10;
+constexpr int kFixedGOPSize = 10;
+constexpr int kKeyframeQp = 10;
+constexpr int kLeafQp = 40;
+constexpr int kArfQp = 15;
+
+// Simple external rate controller for testing.
+class RateControllerForTest {
+ public:
+ RateControllerForTest() : current_gop_(-1) {}
+ ~RateControllerForTest() {}
+
+ void StartNextGop() { ++current_gop_; }
+
+ vpx_rc_gop_decision_t GetCurrentGop() const {
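+    // Only the first GOP starts with a key frame; every GOP uses an alt-ref
+    // and has a fixed length of kFixedGOPSize coding frames.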
+ vpx_rc_gop_decision_t gop_decision;
+ gop_decision.use_key_frame = current_gop_ == 0 ? 1 : 0;
+ gop_decision.use_alt_ref = 1;
+ gop_decision.gop_coding_frames = kFixedGOPSize;
+ return gop_decision;
+ }
+
+ int CalculateFrameDecision(int frame_index) {
+ EXPECT_LE(frame_index, kFixedGOPSize);
+ if (current_gop_ == 0 && frame_index == 0) {
+ // Key frame, first frame in the first GOP.
+ return kKeyframeQp;
+ } else if (frame_index == 1) {
+      // ARF: this test always codes an alt-ref frame.
+ return kArfQp;
+ } else {
+ return kLeafQp;
+ }
+ }
+ int current_gop_;
};
-vpx_rc_status_t rc_create_model(void *priv,
- const vpx_rc_config_t *ratectrl_config,
- vpx_rc_model_t *rate_ctrl_model_ptr) {
- ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
- if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
- toy_rate_ctrl->magic_number = kModelMagicNumber;
- toy_rate_ctrl->coding_index = -1;
- *rate_ctrl_model_ptr = toy_rate_ctrl;
- EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
- EXPECT_EQ(ratectrl_config->frame_width, 352);
- EXPECT_EQ(ratectrl_config->frame_height, 288);
- EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNum);
- EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 24000);
- EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
- EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_create_model_gop(void *priv,
- const vpx_rc_config_t *ratectrl_config,
- vpx_rc_model_t *rate_ctrl_model_ptr) {
- ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
- if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
- toy_rate_ctrl->magic_number = kModelMagicNumber;
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- toy_rate_ctrl->show_index = 0;
- toy_rate_ctrl->coding_index = 0;
- *rate_ctrl_model_ptr = toy_rate_ctrl;
- EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
- EXPECT_EQ(ratectrl_config->frame_width, 640);
- EXPECT_EQ(ratectrl_config->frame_height, 360);
- EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOP);
- EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 4000);
- EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
- EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_create_model_gop_short(
- void *priv, const vpx_rc_config_t *ratectrl_config,
+// Callbacks used in this test.
+vpx_rc_status_t rc_test_create_model(
+ void * /*priv*/, const vpx_rc_config_t * /*ratectrl_config*/,
vpx_rc_model_t *rate_ctrl_model_ptr) {
- ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
- if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
- toy_rate_ctrl->magic_number = kModelMagicNumber;
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- toy_rate_ctrl->show_index = 0;
- toy_rate_ctrl->coding_index = 0;
- *rate_ctrl_model_ptr = toy_rate_ctrl;
- EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
- EXPECT_EQ(ratectrl_config->frame_width, 352);
- EXPECT_EQ(ratectrl_config->frame_height, 288);
- EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOPShort);
- EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 500);
- EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
- EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
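+  // Ownership of the model passes to the encoder; it is freed again in
+  // rc_delete_model().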
+ std::unique_ptr<RateControllerForTest> test_controller(
+ new RateControllerForTest());
+ *rate_ctrl_model_ptr = test_controller.release();
return VPX_RC_OK;
}
-vpx_rc_status_t rc_send_firstpass_stats(
- vpx_rc_model_t rate_ctrl_model,
+vpx_rc_status_t rc_test_send_firstpass_stats(
+ vpx_rc_model_t /*rate_ctrl_model*/,
const vpx_rc_firstpass_stats_t *first_pass_stats) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
EXPECT_EQ(first_pass_stats->num_frames, kFrameNum);
for (int i = 0; i < first_pass_stats->num_frames; ++i) {
EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
@@ -126,37 +89,8 @@ vpx_rc_status_t rc_send_firstpass_stats(
return VPX_RC_OK;
}
-vpx_rc_status_t rc_send_firstpass_stats_gop(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_firstpass_stats_t *first_pass_stats) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOP);
- for (int i = 0; i < first_pass_stats->num_frames; ++i) {
- EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
- }
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_send_firstpass_stats_gop_short(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_firstpass_stats_t *first_pass_stats) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOPShort);
- for (int i = 0; i < first_pass_stats->num_frames; ++i) {
- EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
- }
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model,
- const VpxTplGopStats *tpl_gop_stats) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+vpx_rc_status_t rc_test_send_tpl_gop_stats(
+ vpx_rc_model_t /*rate_ctrl_model*/, const VpxTplGopStats *tpl_gop_stats) {
EXPECT_GT(tpl_gop_stats->size, 0);
for (int i = 0; i < tpl_gop_stats->size; ++i) {
@@ -165,522 +99,38 @@ vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model,
return VPX_RC_OK;
}
-vpx_rc_status_t rc_get_encodeframe_decision(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
+vpx_rc_status_t rc_test_get_encodeframe_decision(
+ vpx_rc_model_t rate_ctrl_model, const int frame_gop_index,
vpx_rc_encodeframe_decision_t *frame_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- toy_rate_ctrl->coding_index += 1;
-
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-
- EXPECT_LT(encode_frame_info->show_index, kFrameNum);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- if (encode_frame_info->coding_index == 0) {
- EXPECT_EQ(encode_frame_info->show_index, 0);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- } else if (encode_frame_info->coding_index == 1) {
- EXPECT_EQ(encode_frame_info->show_index, 4);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 0); // kRefFrameTypeLast
- } else if (encode_frame_info->coding_index >= 2 &&
- encode_frame_info->coding_index < 5) {
- // In the first group of pictures, coding_index and gop_index are equal.
- EXPECT_EQ(encode_frame_info->gop_index, encode_frame_info->coding_index);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- } else if (encode_frame_info->coding_index == 5) {
- EXPECT_EQ(encode_frame_info->show_index, 4);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 1); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 1); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 4); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[2],
- 1); // kRefFrameTypeFuture
- }
- if (encode_frame_info->coding_index == kLosslessCodingIndex) {
- // We should get sse == 0 at rc_update_encodeframe_result()
- frame_decision->q_index = 0;
- } else {
- frame_decision->q_index = 100;
- }
- frame_decision->max_frame_size = 0;
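+  // Map the frame's position within the GOP to one of the fixed test QPs.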
+ RateControllerForTest *test_controller =
+ static_cast<RateControllerForTest *>(rate_ctrl_model);
+ frame_decision->q_index =
+ test_controller->CalculateFrameDecision(frame_gop_index);
return VPX_RC_OK;
}
-vpx_rc_status_t rc_get_encodeframe_decision_gop(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
- vpx_rc_encodeframe_decision_t *frame_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_LT(encode_frame_info->show_index, kFrameNumGOP);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- if (encode_frame_info->coding_index == 0) {
- EXPECT_EQ(encode_frame_info->show_index, 0);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- } else if (encode_frame_info->coding_index == 1) {
- EXPECT_EQ(encode_frame_info->show_index, 1);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 0); // kRefFrameTypeLast
- } else if (encode_frame_info->coding_index == 2) {
- EXPECT_EQ(encode_frame_info->show_index, 2);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- } else if (encode_frame_info->coding_index == 3 ||
- encode_frame_info->coding_index == 12 ||
- encode_frame_info->coding_index == 21) {
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- } else if (encode_frame_info->coding_index == 11 ||
- encode_frame_info->coding_index == 20 ||
- encode_frame_info->coding_index == 29) {
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- } else if (encode_frame_info->coding_index >= 30) {
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- }
-
- // When the model recommends an invalid q, valid range [0, 255],
- // the encoder will ignore it and use the default q selected
- // by libvpx rate control strategy.
- frame_decision->q_index = VPX_DEFAULT_Q;
- frame_decision->max_frame_size = 0;
-
- toy_rate_ctrl->coding_index += 1;
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_encodeframe_decision_gop_short(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
- vpx_rc_encodeframe_decision_t *frame_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- if (encode_frame_info->coding_index == 0) {
- EXPECT_EQ(encode_frame_info->show_index, 0);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 1) {
- EXPECT_EQ(encode_frame_info->show_index, 1);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 2) {
- EXPECT_EQ(encode_frame_info->show_index, 2);
- EXPECT_EQ(encode_frame_info->gop_index, 2);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 3) {
- EXPECT_EQ(encode_frame_info->show_index, 3);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2);
- }
-
- // When the model recommends an invalid q, valid range [0, 255],
- // the encoder will ignore it and use the default q selected
- // by libvpx rate control strategy.
- frame_decision->q_index = VPX_DEFAULT_Q;
- frame_decision->max_frame_size = 0;
-
- toy_rate_ctrl->coding_index += 1;
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
- vpx_rc_encodeframe_decision_t *frame_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- if (encode_frame_info->coding_index == 0) {
- EXPECT_EQ(encode_frame_info->show_index, 0);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 1) {
- EXPECT_EQ(encode_frame_info->show_index, 3);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 2) {
- EXPECT_EQ(encode_frame_info->show_index, 1);
- EXPECT_EQ(encode_frame_info->gop_index, 2);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 3) {
- EXPECT_EQ(encode_frame_info->show_index, 2);
- EXPECT_EQ(encode_frame_info->gop_index, 3);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 4) {
- EXPECT_EQ(encode_frame_info->show_index, 3);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- }
-
- // When the model recommends an invalid q, valid range [0, 255],
- // the encoder will ignore it and use the default q selected
- // by libvpx rate control strategy.
- frame_decision->q_index = VPX_DEFAULT_Q;
- frame_decision->max_frame_size = 0;
-
- toy_rate_ctrl->coding_index += 1;
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_encodeframe_decision_gop_short_no_arf(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
- vpx_rc_encodeframe_decision_t *frame_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- if (encode_frame_info->coding_index == 0) {
- EXPECT_EQ(encode_frame_info->show_index, 0);
- EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 1) {
- EXPECT_EQ(encode_frame_info->show_index, 1);
- EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
- 1); // kRefFrameTypeLast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
- 0); // kRefFrameTypePast
- EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
- 0); // kRefFrameTypeFuture
- EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
- 0); // kRefFrameTypeLast
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 2) {
- EXPECT_EQ(encode_frame_info->show_index, 2);
- EXPECT_EQ(encode_frame_info->gop_index, 2);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- } else if (encode_frame_info->coding_index == 3) {
- EXPECT_EQ(encode_frame_info->show_index, 3);
- EXPECT_EQ(encode_frame_info->gop_index, 3);
- EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
- EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
- }
-
- // When the model recommends an invalid q, valid range [0, 255],
- // the encoder will ignore it and use the default q selected
- // by libvpx rate control strategy.
- frame_decision->q_index = VPX_DEFAULT_Q;
- frame_decision->max_frame_size = 0;
-
- toy_rate_ctrl->coding_index += 1;
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_gop_info_t *gop_info,
- vpx_rc_gop_decision_t *gop_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames);
- EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
- EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
- EXPECT_EQ(gop_info->active_min_gf_interval, kReadMinGfInterval);
- EXPECT_EQ(gop_info->active_max_gf_interval, kReadMaxGfInterval);
- EXPECT_EQ(gop_info->allow_alt_ref, 1);
- if (gop_info->is_key_frame) {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- EXPECT_EQ(gop_info->frames_since_key, 0);
- EXPECT_EQ(gop_info->gop_global_index, 0);
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- } else {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1);
- }
- EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
- EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
- EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
- EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
-
- gop_decision->gop_coding_frames =
- VPXMIN(kFixedGOPSize, gop_info->frames_to_key);
- gop_decision->use_alt_ref = gop_decision->gop_coding_frames == kFixedGOPSize;
- toy_rate_ctrl->frames_since_key +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- toy_rate_ctrl->show_index +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- ++toy_rate_ctrl->gop_global_index;
- return VPX_RC_OK;
-}
-
-// Test on a 4 frame video.
-// Test a setting of 2 GOPs.
-// The first GOP has 3 coding frames, no alt ref.
-// The second GOP has 1 coding frame, no alt ref.
-vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_gop_info_t *gop_info,
- vpx_rc_gop_decision_t *gop_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
- EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
- EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
- EXPECT_EQ(gop_info->allow_alt_ref, 1);
- if (gop_info->is_key_frame) {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- EXPECT_EQ(gop_info->frames_since_key, 0);
- EXPECT_EQ(gop_info->gop_global_index, 0);
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- } else {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- }
- EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
- EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
- EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
- EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
-
- gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 3 : 1;
- gop_decision->use_alt_ref = 0;
- toy_rate_ctrl->frames_since_key +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- toy_rate_ctrl->show_index +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- ++toy_rate_ctrl->gop_global_index;
- return VPX_RC_OK;
-}
-
-// Test on a 4 frame video.
-// Test a setting of 2 GOPs.
-// The first GOP has 4 coding frames. Use alt ref.
-// The second GOP only contains the overlay frame of the first GOP's alt ref
-// frame.
-vpx_rc_status_t rc_get_gop_decision_short_overlay(
- vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
- vpx_rc_gop_decision_t *gop_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
- EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
- EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
- EXPECT_EQ(gop_info->allow_alt_ref, 1);
- if (gop_info->is_key_frame) {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- EXPECT_EQ(gop_info->frames_since_key, 0);
- EXPECT_EQ(gop_info->gop_global_index, 0);
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- } else {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1);
- }
- EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
- EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
- EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
- EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
-
- gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1;
- gop_decision->use_alt_ref = gop_info->is_key_frame ? 1 : 0;
- toy_rate_ctrl->frames_since_key +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- toy_rate_ctrl->show_index +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- ++toy_rate_ctrl->gop_global_index;
- return VPX_RC_OK;
-}
-
-// Test on a 4 frame video.
-// Test a setting of 1 GOP.
-// The GOP has 4 coding frames. Do not use alt ref.
-vpx_rc_status_t rc_get_gop_decision_short_no_arf(
- vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
- vpx_rc_gop_decision_t *gop_decision) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
- EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
- EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
- EXPECT_EQ(gop_info->allow_alt_ref, 1);
- if (gop_info->is_key_frame) {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- EXPECT_EQ(gop_info->frames_since_key, 0);
- EXPECT_EQ(gop_info->gop_global_index, 0);
- toy_rate_ctrl->gop_global_index = 0;
- toy_rate_ctrl->frames_since_key = 0;
- } else {
- EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
- }
- EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
- EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
- EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
- EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
-
- gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1;
- gop_decision->use_alt_ref = 0;
- toy_rate_ctrl->frames_since_key +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- toy_rate_ctrl->show_index +=
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
- ++toy_rate_ctrl->gop_global_index;
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_update_encodeframe_result(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_result_t *encode_frame_result) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-
- const int64_t ref_pixel_count = 352 * 288 * 3 / 2;
- EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
- if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) {
- EXPECT_EQ(encode_frame_result->sse, 0);
- }
- if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) {
- EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 0);
- } else {
- EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 100);
- }
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_update_encodeframe_result_gop(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_result_t *encode_frame_result) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-
- const int64_t ref_pixel_count = 640 * 360 * 3 / 2;
- EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_update_encodeframe_result_gop_short(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_result_t *encode_frame_result) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
-
- const int64_t ref_pixel_count = 352 * 288 * 3 / 2;
- EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
- return VPX_RC_OK;
-}
-
-vpx_rc_status_t rc_get_default_frame_rdmult(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info, int *rdmult) {
- const ToyRateCtrl *toy_rate_ctrl =
- static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
- EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
-
- *rdmult = VPX_DEFAULT_RDMULT;
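+// Advances the test controller to the next GOP and reports its structure.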
+vpx_rc_status_t rc_test_get_gop_decision(vpx_rc_model_t rate_ctrl_model,
+ vpx_rc_gop_decision_t *gop_decision) {
+ RateControllerForTest *test_controller =
+ static_cast<RateControllerForTest *>(rate_ctrl_model);
+ test_controller->StartNextGop();
+ *gop_decision = test_controller->GetCurrentGop();
return VPX_RC_OK;
}
vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) {
- ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
- EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
- delete toy_rate_ctrl;
+ RateControllerForTest *test_controller =
+ static_cast<RateControllerForTest *>(rate_ctrl_model);
+ delete test_controller;
return VPX_RC_OK;
}
class ExtRateCtrlTest : public ::libvpx_test::EncoderTest,
public ::testing::Test {
protected:
- ExtRateCtrlTest() : EncoderTest(&::libvpx_test::kVP9) {}
+ ExtRateCtrlTest()
+ : EncoderTest(&::libvpx_test::kVP9), frame_number_(0),
+ current_frame_qp_(0) {}
~ExtRateCtrlTest() override = default;
@@ -693,287 +143,62 @@ class ExtRateCtrlTest : public ::libvpx_test::EncoderTest,
::libvpx_test::Encoder *encoder) override {
if (video->frame() == 0) {
vpx_rc_funcs_t rc_funcs = {};
- rc_funcs.rc_type = VPX_RC_QP;
- rc_funcs.create_model = rc_create_model;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats;
- rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision;
- rc_funcs.update_encodeframe_result = rc_update_encodeframe_result;
- rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
- encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
- }
- }
-};
-
-TEST_F(ExtRateCtrlTest, EncodeTest) {
- cfg_.rc_target_bitrate = 24000;
-
- std::unique_ptr<libvpx_test::VideoSource> video;
- video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0,
- kFrameNum));
-
- ASSERT_NE(video, nullptr);
- ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest,
- public ::libvpx_test::CodecTestWithParam<int> {
- protected:
- ExtRateCtrlTestGOP() : EncoderTest(&::libvpx_test::kVP9) {}
-
- ~ExtRateCtrlTestGOP() override = default;
-
- void SetUp() override {
- InitializeConfig();
- SetMode(::libvpx_test::kTwoPassGood);
- }
-
- void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
- ::libvpx_test::Encoder *encoder) override {
- if (video->frame() == 0) {
- encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
- encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
-
- vpx_rc_funcs_t rc_funcs = {};
- rc_funcs.rc_type = VPX_RC_GOP_QP;
- rc_funcs.create_model = rc_create_model_gop;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop;
- rc_funcs.send_tpl_gop_stats = rc_send_tpl_gop_stats;
- rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop;
- rc_funcs.get_gop_decision = rc_get_gop_decision;
- rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop;
- rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
- encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
- }
- }
-};
-
-TEST_F(ExtRateCtrlTestGOP, EncodeTest) {
- cfg_.rc_target_bitrate = 4000;
- cfg_.g_lag_in_frames = kMaxLagInFrames;
- cfg_.rc_end_usage = VPX_VBR;
-
- std::unique_ptr<libvpx_test::VideoSource> video;
- video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- "noisy_clip_640_360.y4m", VPX_IMG_FMT_I420, 640, 360, 30, 1, 0,
- kFrameNumGOP));
-
- ASSERT_NE(video, nullptr);
- ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest,
- public ::libvpx_test::CodecTestWithParam<int> {
- protected:
- ExtRateCtrlTestGOPShort() : EncoderTest(&::libvpx_test::kVP9) {}
-
- ~ExtRateCtrlTestGOPShort() override = default;
-
- void SetUp() override {
- InitializeConfig();
- SetMode(::libvpx_test::kTwoPassGood);
- }
-
- void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
- ::libvpx_test::Encoder *encoder) override {
- if (video->frame() == 0) {
- encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
- encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
- encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
-
- vpx_rc_funcs_t rc_funcs = {};
- rc_funcs.rc_type = VPX_RC_GOP_QP;
- rc_funcs.create_model = rc_create_model_gop_short;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
- rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short;
- rc_funcs.get_gop_decision = rc_get_gop_decision_short;
- rc_funcs.update_encodeframe_result =
- rc_update_encodeframe_result_gop_short;
- rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
- encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
- }
- }
-};
-
-TEST_F(ExtRateCtrlTestGOPShort, EncodeTest) {
- cfg_.rc_target_bitrate = 500;
- cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
- cfg_.rc_end_usage = VPX_VBR;
-
- std::unique_ptr<libvpx_test::VideoSource> video;
- video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
-
- ASSERT_NE(video, nullptr);
- ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestGOPShortOverlay
- : public ::libvpx_test::EncoderTest,
- public ::libvpx_test::CodecTestWithParam<int> {
- protected:
- ExtRateCtrlTestGOPShortOverlay() : EncoderTest(&::libvpx_test::kVP9) {}
-
- ~ExtRateCtrlTestGOPShortOverlay() override = default;
-
- void SetUp() override {
- InitializeConfig();
- SetMode(::libvpx_test::kTwoPassGood);
- }
-
- void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
- ::libvpx_test::Encoder *encoder) override {
- if (video->frame() == 0) {
- encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
- encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
- encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
-
- vpx_rc_funcs_t rc_funcs = {};
rc_funcs.rc_type = VPX_RC_GOP_QP;
- rc_funcs.create_model = rc_create_model_gop_short;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
- rc_funcs.get_encodeframe_decision =
- rc_get_encodeframe_decision_gop_short_overlay;
- rc_funcs.get_gop_decision = rc_get_gop_decision_short_overlay;
- rc_funcs.update_encodeframe_result =
- rc_update_encodeframe_result_gop_short;
+ rc_funcs.create_model = rc_test_create_model;
+ rc_funcs.send_firstpass_stats = rc_test_send_firstpass_stats;
+ rc_funcs.send_tpl_gop_stats = rc_test_send_tpl_gop_stats;
+ rc_funcs.get_gop_decision = rc_test_get_gop_decision;
+ rc_funcs.get_encodeframe_decision = rc_test_get_encodeframe_decision;
rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
}
}
-};
-
-TEST_F(ExtRateCtrlTestGOPShortOverlay, EncodeTest) {
- cfg_.rc_target_bitrate = 500;
- cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
- cfg_.rc_end_usage = VPX_VBR;
-
- std::unique_ptr<libvpx_test::VideoSource> video;
- video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
-
- ASSERT_NE(video, nullptr);
- ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestGOPShortNoARF
- : public ::libvpx_test::EncoderTest,
- public ::libvpx_test::CodecTestWithParam<int> {
- protected:
- ExtRateCtrlTestGOPShortNoARF() : EncoderTest(&::libvpx_test::kVP9) {}
-
- ~ExtRateCtrlTestGOPShortNoARF() override = default;
- void SetUp() override {
- InitializeConfig();
- SetMode(::libvpx_test::kTwoPassGood);
+#if CONFIG_VP9_DECODER
+ bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+ const ::libvpx_test::VideoSource & /*video*/,
+ ::libvpx_test::Decoder *decoder) override {
+ EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+ decoder->Control(VPXD_GET_LAST_QUANTIZER, &current_frame_qp_);
+ return VPX_CODEC_OK == res_dec;
}
- void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
- ::libvpx_test::Encoder *encoder) override {
- if (video->frame() == 0) {
- encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
- encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
- encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
-
- vpx_rc_funcs_t rc_funcs = {};
- rc_funcs.rc_type = VPX_RC_GOP_QP;
- rc_funcs.create_model = rc_create_model_gop_short;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
- rc_funcs.get_encodeframe_decision =
- rc_get_encodeframe_decision_gop_short_no_arf;
- rc_funcs.get_gop_decision = rc_get_gop_decision_short_no_arf;
- rc_funcs.update_encodeframe_result =
- rc_update_encodeframe_result_gop_short;
- rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
- encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+ void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+ if (frame_number_ == 0) {
+ // This must be a key frame
+ EXPECT_TRUE((pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0);
+ EXPECT_EQ(current_frame_qp_, kKeyframeQp);
+ ++frame_number_;
+ return;
}
- }
-};
-
-TEST_F(ExtRateCtrlTestGOPShortNoARF, EncodeTest) {
- cfg_.rc_target_bitrate = 500;
- cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
- cfg_.rc_end_usage = VPX_VBR;
-
- std::unique_ptr<libvpx_test::VideoSource> video;
- video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
-
- ASSERT_NE(video, nullptr);
- ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-}
-
-class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest,
- public ::testing::Test {
- protected:
- ExtRateCtrlTestRdmult() : EncoderTest(&::libvpx_test::kVP9) {}
-
- ~ExtRateCtrlTestRdmult() override = default;
-
- void SetUp() override {
- InitializeConfig();
- SetMode(::libvpx_test::kTwoPassGood);
- }
-
- void BeginPassHook(unsigned int) override {
- psnr_ = 0.0;
- nframes_ = 0;
- }
-
- void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
- psnr_ += pkt->data.psnr.psnr[0];
- nframes_++;
- }
- void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
- ::libvpx_test::Encoder *encoder) override {
- if (video->frame() == 0) {
- vpx_rc_funcs_t rc_funcs = {};
- rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT;
- rc_funcs.create_model = rc_create_model_gop_short;
- rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
- rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short;
- rc_funcs.get_gop_decision = rc_get_gop_decision_short;
- rc_funcs.update_encodeframe_result =
- rc_update_encodeframe_result_gop_short;
- rc_funcs.get_frame_rdmult = rc_get_default_frame_rdmult;
- rc_funcs.delete_model = rc_delete_model;
- rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
- encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+ if ((pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) != 0) {
+ // This is an ARF (alt-ref) frame
+ EXPECT_EQ(current_frame_qp_, kArfQp);
+ ++frame_number_;
+ return;
}
- }
- double GetAveragePsnr() const {
- if (nframes_) return psnr_ / nframes_;
- return 0.0;
+ EXPECT_EQ(current_frame_qp_, kLeafQp);
+ ++frame_number_;
}
+#endif // CONFIG_VP9_DECODER
- private:
- double psnr_;
- unsigned int nframes_;
+ int frame_number_;
+ int current_frame_qp_;
};
-TEST_F(ExtRateCtrlTestRdmult, DefaultRdmult) {
- cfg_.rc_target_bitrate = 500;
- cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
- cfg_.rc_end_usage = VPX_VBR;
- init_flags_ = VPX_CODEC_USE_PSNR;
+TEST_F(ExtRateCtrlTest, EncodeTest) {
+ cfg_.rc_target_bitrate = 4000;
+ cfg_.g_lag_in_frames = 25;
std::unique_ptr<libvpx_test::VideoSource> video;
video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
- kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+ "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0,
+ kFrameNum));
ASSERT_NE(video, nullptr);
ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
-
- const double psnr = GetAveragePsnr();
- EXPECT_GT(psnr, kPsnrThreshold);
}
} // namespace
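
For orientation, the hook these tests exercise registers an external rate controller with the encoder through a table of callbacks. A minimal registration sketch follows; the my_rc_* callbacks are hypothetical and must implement the signatures declared in vpx/vpx_ext_ratectrl.h, while the field names, the rc_type value, and the control ID are the ones used in the test code above:

    /* Sketch: wiring an external rate controller into a VP9 encoder
     * instance `codec` (error handling elided). */
    vpx_rc_funcs_t rc_funcs = {};
    rc_funcs.rc_type = VPX_RC_GOP_QP; /* external GOP + QP decisions */
    rc_funcs.create_model = my_rc_create_model;
    rc_funcs.send_firstpass_stats = my_rc_send_firstpass_stats;
    rc_funcs.send_tpl_gop_stats = my_rc_send_tpl_gop_stats;
    rc_funcs.get_gop_decision = my_rc_get_gop_decision;
    rc_funcs.get_encodeframe_decision = my_rc_get_encodeframe_decision;
    rc_funcs.delete_model = my_rc_delete_model;
    vpx_codec_control(&codec, VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
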
diff --git a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc
index f7be47542c..a6c7563348 100644
--- a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc
+++ b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc
@@ -9,6 +9,7 @@
*/
#include "vp9/ratectrl_rtc.h"
+#include <climits>
#include <fstream> // NOLINT
#include <string>
@@ -19,6 +20,8 @@
#include "test/i420_video_source.h"
#include "test/util.h"
#include "test/video_source.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
#include "vpx/vpx_codec.h"
#include "vpx_ports/bitops.h"
diff --git a/media/libvpx/libvpx/test/vp9_scale_test.cc b/media/libvpx/libvpx/test/vp9_scale_test.cc
index 049a10a617..a5a18a7e9d 100644
--- a/media/libvpx/libvpx/test/vp9_scale_test.cc
+++ b/media/libvpx/libvpx/test/vp9_scale_test.cc
@@ -48,12 +48,11 @@ class ScaleTest : public VpxScaleBase,
}
void RunTest(INTERP_FILTER filter_type) {
- static const int kNumSizesToTest = 20;
+ static const int kNumSizesToTest = 22;
static const int kNumScaleFactorsToTest = 4;
- static const int kSizesToTest[] = {
- 2, 4, 6, 8, 10, 12, 14, 16, 18, 20,
- 22, 24, 26, 28, 30, 32, 34, 68, 128, 134
- };
+ static const int kSizesToTest[] = { 1, 2, 3, 4, 6, 8, 10, 12,
+ 14, 16, 18, 20, 22, 24, 26, 28,
+ 30, 32, 34, 68, 128, 134 };
static const int kScaleFactors[] = { 1, 2, 3, 4 };
for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) {
for (int h = 0; h < kNumSizesToTest; ++h) {
diff --git a/media/libvpx/libvpx/tools_common.c b/media/libvpx/libvpx/tools_common.c
index 5c13781513..5af971f720 100644
--- a/media/libvpx/libvpx/tools_common.c
+++ b/media/libvpx/libvpx/tools_common.c
@@ -26,15 +26,9 @@
#include "vpx/vpx_codec.h"
-#if defined(_WIN32) || defined(__OS2__)
+#if defined(_WIN32)
#include <io.h>
#include <fcntl.h>
-
-#ifdef __OS2__
-#define _setmode setmode
-#define _fileno fileno
-#define _O_BINARY O_BINARY
-#endif
#endif
#define LOG_ERROR(label) \
@@ -58,7 +52,7 @@ static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
FILE *set_binary_mode(FILE *stream) {
(void)stream;
-#if defined(_WIN32) || defined(__OS2__)
+#if defined(_WIN32)
_setmode(_fileno(stream), _O_BINARY);
#endif
return stream;
@@ -96,9 +90,9 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) {
int w = vpx_img_plane_width(yuv_frame, plane);
const int h = vpx_img_plane_height(yuv_frame, plane);
int r;
- // Assuming that for nv12 we read all chroma data at one time
+ // Assuming that for nv12 we read all chroma data at once
if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
- // Fixing NV12 chroma width it is odd
+ // Fixing NV12 chroma width if it is odd
if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
/* Determine the correct plane based on the image format. The for-loop
* always counts in Y,U,V order, but this may not match the order of
@@ -229,17 +223,22 @@ int vpx_img_plane_height(const vpx_image_t *img, int plane) {
void vpx_img_write(const vpx_image_t *img, FILE *file) {
int plane;
+ const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
for (plane = 0; plane < 3; ++plane) {
const unsigned char *buf = img->planes[plane];
const int stride = img->stride[plane];
- const int w = vpx_img_plane_width(img, plane) *
- ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+ int w = vpx_img_plane_width(img, plane);
const int h = vpx_img_plane_height(img, plane);
int y;
+ // Assuming that for nv12 we write all chroma data at once
+ if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
+ // Fixing NV12 chroma width if it is odd
+ if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
+
for (y = 0; y < h; ++y) {
- fwrite(buf, 1, w, file);
+ fwrite(buf, bytespp, w, file);
buf += stride;
}
}
@@ -247,17 +246,22 @@ void vpx_img_write(const vpx_image_t *img, FILE *file) {
int vpx_img_read(vpx_image_t *img, FILE *file) {
int plane;
+ const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
for (plane = 0; plane < 3; ++plane) {
unsigned char *buf = img->planes[plane];
const int stride = img->stride[plane];
- const int w = vpx_img_plane_width(img, plane) *
- ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+ int w = vpx_img_plane_width(img, plane);
const int h = vpx_img_plane_height(img, plane);
int y;
+ // Assuming that for nv12 we read all chroma data at once
+ if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
+ // Fixing NV12 chroma width if it is odd
+ if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
+
for (y = 0; y < h; ++y) {
- if (fread(buf, 1, w, file) != (size_t)w) return 0;
+ if (fread(buf, bytespp, w, file) != (size_t)w) return 0;
buf += stride;
}
}
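
For reference, the NV12 guards added above reflect two layout facts: chroma is stored interleaved (U, V, U, V, ...) in plane 1 and there is no plane 2, and a chroma row is rounded up to an even width so the final U/V pair of an odd-width image survives. A small copy helper under the same assumptions (the helper name is illustrative; vpx_img_plane_width()/vpx_img_plane_height() are the helpers from this file):

    #include <string.h>
    #include "tools_common.h" /* vpx_img_plane_width/height */

    /* Illustrative: copy an image's stored planes, honoring the NV12 rules
     * used by vpx_img_read()/vpx_img_write() above. */
    static void copy_img_planes(const vpx_image_t *img, unsigned char *dst) {
      const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
      int plane;
      for (plane = 0; plane < 3; ++plane) {
        const unsigned char *src = img->planes[plane];
        int w = vpx_img_plane_width(img, plane);
        const int h = vpx_img_plane_height(img, plane);
        int y;
        /* NV12: U and V are interleaved in plane 1; plane 2 does not exist. */
        if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
        /* NV12: round an odd chroma width up to keep the last U/V pair. */
        if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
        for (y = 0; y < h; ++y) {
          memcpy(dst, src, (size_t)w * bytespp);
          dst += (size_t)w * bytespp;
          src += img->stride[plane];
        }
      }
    }
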
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
index ee3c281f0f..a54e81084b 100644
--- a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
@@ -16,7 +16,7 @@
#include "vpx_ports/mem.h"
static const int8_t vp8_sub_pel_filters[8][8] = {
- { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */
+ { 0, 0, -128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */
{ 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */
{ 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
{ 0, -9, 93, 50, -6, 0, 0, 0 },
diff --git a/media/libvpx/libvpx/vp8/common/entropy.c b/media/libvpx/libvpx/vp8/common/entropy.c
index fc4a3539fd..b9efc0cc1f 100644
--- a/media/libvpx/libvpx/vp8/common/entropy.c
+++ b/media/libvpx/libvpx/vp8/common/entropy.c
@@ -114,7 +114,7 @@ static const vp8_prob Pcat6[] = { 254, 254, 243, 230, 196, 177,
p[0] = p[1] = 0;
}
- void init_bit_trees() {
+ void init_bit_trees(void) {
init_bit_tree(cat1, 1);
init_bit_tree(cat2, 2);
init_bit_tree(cat3, 3);
diff --git a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
index 71529bdfd8..7c8e083f4f 100644
--- a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
+++ b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
@@ -25,23 +25,19 @@
#include "vp8/common/systemdependent.h"
#if CONFIG_MULTITHREAD
-#if HAVE_UNISTD_H && !defined(__OS2__)
+#if HAVE_UNISTD_H
#include <unistd.h>
#elif defined(_WIN32)
#include <windows.h>
typedef void(WINAPI *PGNSI)(LPSYSTEM_INFO);
-#elif defined(__OS2__)
-#define INCL_DOS
-#define INCL_DOSSPINLOCK
-#include <os2.h>
#endif
#endif
#if CONFIG_MULTITHREAD
-static int get_cpu_count() {
+static int get_cpu_count(void) {
int core_count = 16;
-#if HAVE_UNISTD_H && !defined(__OS2__)
+#if HAVE_UNISTD_H
#if defined(_SC_NPROCESSORS_ONLN)
core_count = (int)sysconf(_SC_NPROCESSORS_ONLN);
#elif defined(_SC_NPROC_ONLN)
@@ -49,38 +45,13 @@ static int get_cpu_count() {
#endif
#elif defined(_WIN32)
{
-#if _WIN32_WINNT >= 0x0501
+#if _WIN32_WINNT < 0x0501
+#error _WIN32_WINNT must target Windows XP or newer.
+#endif
SYSTEM_INFO sysinfo;
GetNativeSystemInfo(&sysinfo);
-#else
- PGNSI pGNSI;
- SYSTEM_INFO sysinfo;
-
- /* Call GetNativeSystemInfo if supported or
- * GetSystemInfo otherwise. */
-
- pGNSI = (PGNSI)GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
- "GetNativeSystemInfo");
- if (pGNSI != NULL)
- pGNSI(&sysinfo);
- else
- GetSystemInfo(&sysinfo);
-#endif
-
core_count = (int)sysinfo.dwNumberOfProcessors;
}
-#elif defined(__OS2__)
- {
- ULONG proc_id;
- ULONG status;
-
- core_count = 0;
- for (proc_id = 1;; ++proc_id) {
- if (DosGetProcessorStatus(proc_id, &status)) break;
-
- if (status == PROC_ONLINE) core_count++;
- }
- }
#else
/* other platforms */
#endif
diff --git a/media/libvpx/libvpx/vp8/common/onyx.h b/media/libvpx/libvpx/vp8/common/onyx.h
index 1b70ea5dba..2038c000b0 100644
--- a/media/libvpx/libvpx/vp8/common/onyx.h
+++ b/media/libvpx/libvpx/vp8/common/onyx.h
@@ -242,7 +242,7 @@ typedef struct {
#endif
} VP8_CONFIG;
-void vp8_initialize();
+void vp8_initialize(void);
struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf);
void vp8_remove_compressor(struct VP8_COMP **comp);
diff --git a/media/libvpx/libvpx/vp8/common/rtcd.c b/media/libvpx/libvpx/vp8/common/rtcd.c
index 09a0e2b4b3..102b7ccd54 100644
--- a/media/libvpx/libvpx/vp8/common/rtcd.c
+++ b/media/libvpx/libvpx/vp8/common/rtcd.c
@@ -12,4 +12,4 @@
#include "./vp8_rtcd.h"
#include "vpx_ports/vpx_once.h"
-void vp8_rtcd() { once(setup_rtcd_internal); }
+void vp8_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vp8/common/threading.h b/media/libvpx/libvpx/vp8/common/threading.h
index 1cfb9fec51..0de75cfde3 100644
--- a/media/libvpx/libvpx/vp8/common/threading.h
+++ b/media/libvpx/libvpx/vp8/common/threading.h
@@ -19,161 +19,57 @@ extern "C" {
#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
-/* Thread management macros */
#if defined(_WIN32) && !HAVE_PTHREAD_H
/* Win32 */
-#include <process.h>
#include <windows.h>
-#if defined(__GNUC__) && \
- (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
-#define THREAD_FUNCTION \
- __attribute__((force_align_arg_pointer)) unsigned int __stdcall
-#else
-#define THREAD_FUNCTION unsigned int __stdcall
-#endif
-#define THREAD_FUNCTION_RETURN DWORD
-#define THREAD_SPECIFIC_INDEX DWORD
-#define pthread_t HANDLE
-#define pthread_attr_t DWORD
-#define pthread_detach(thread) \
- if (thread != NULL) CloseHandle(thread)
-#define thread_sleep(nms) Sleep(nms)
-#define pthread_cancel(thread) terminate_thread(thread, 0)
-#define ts_key_create(ts_key, destructor) \
- { ts_key = TlsAlloc(); };
-#define pthread_getspecific(ts_key) TlsGetValue(ts_key)
-#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value)
-#define pthread_self() GetCurrentThreadId()
-
-#elif defined(__OS2__)
-/* OS/2 */
-#define INCL_DOS
-#include <os2.h>
-
-#include <stdlib.h>
-#define THREAD_FUNCTION void *
-#define THREAD_FUNCTION_RETURN void *
-#define THREAD_SPECIFIC_INDEX PULONG
-#define pthread_t TID
-#define pthread_attr_t ULONG
-#define pthread_detach(thread) 0
-#define thread_sleep(nms) DosSleep(nms)
-#define pthread_cancel(thread) DosKillThread(thread)
-#define ts_key_create(ts_key, destructor) \
- DosAllocThreadLocalMemory(1, &(ts_key));
-#define pthread_getspecific(ts_key) ((void *)(*(ts_key)))
-#define pthread_setspecific(ts_key, value) (*(ts_key) = (ULONG)(value))
-#define pthread_self() _gettid()
#else
+/* pthreads */
#ifdef __APPLE__
#include <mach/mach_init.h>
#include <mach/semaphore.h>
#include <mach/task.h>
#include <time.h>
#include <unistd.h>
-
#else
#include <semaphore.h>
#endif
-
-#include <pthread.h>
-/* pthreads */
-/* Nearly everything is already defined */
-#define THREAD_FUNCTION void *
-#define THREAD_FUNCTION_RETURN void *
-#define THREAD_SPECIFIC_INDEX pthread_key_t
-#define ts_key_create(ts_key, destructor) \
- pthread_key_create(&(ts_key), destructor);
#endif
/* Synchronization macros: Win32 and Pthreads */
#if defined(_WIN32) && !HAVE_PTHREAD_H
-#define sem_t HANDLE
-#define pause(voidpara) __asm PAUSE
-#define sem_init(sem, sem_attr1, sem_init_value) \
- (int)((*sem = CreateSemaphore(NULL, 0, 32768, NULL)) == NULL)
-#define sem_wait(sem) \
+#define vp8_sem_t HANDLE
+#define vp8_sem_init(sem, pshared, value) \
+ (int)((*sem = CreateSemaphore(NULL, value, 32768, NULL)) == NULL)
+#define vp8_sem_wait(sem) \
(int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem, INFINITE))
-#define sem_post(sem) ReleaseSemaphore(*sem, 1, NULL)
-#define sem_destroy(sem) \
+#define vp8_sem_post(sem) ReleaseSemaphore(*sem, 1, NULL)
+#define vp8_sem_destroy(sem) \
if (*sem) ((int)(CloseHandle(*sem)) == TRUE)
#define thread_sleep(nms) Sleep(nms)
-#elif defined(__OS2__)
-typedef struct {
- HEV event;
- HMTX wait_mutex;
- HMTX count_mutex;
- int count;
-} sem_t;
-
-static inline int sem_init(sem_t *sem, int pshared, unsigned int value) {
- DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0,
- value > 0 ? TRUE : FALSE);
- DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE);
- DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE);
-
- sem->count = value;
-
- return 0;
-}
-
-static inline int sem_wait(sem_t *sem) {
- DosRequestMutexSem(sem->wait_mutex, -1);
-
- DosWaitEventSem(sem->event, -1);
-
- DosRequestMutexSem(sem->count_mutex, -1);
-
- sem->count--;
- if (sem->count == 0) {
- ULONG post_count;
-
- DosResetEventSem(sem->event, &post_count);
- }
-
- DosReleaseMutexSem(sem->count_mutex);
-
- DosReleaseMutexSem(sem->wait_mutex);
-
- return 0;
-}
-
-static inline int sem_post(sem_t *sem) {
- DosRequestMutexSem(sem->count_mutex, -1);
-
- if (sem->count < 32768) {
- sem->count++;
- DosPostEventSem(sem->event);
- }
-
- DosReleaseMutexSem(sem->count_mutex);
-
- return 0;
-}
-
-static inline int sem_destroy(sem_t *sem) {
- DosCloseEventSem(sem->event);
- DosCloseMutexSem(sem->wait_mutex);
- DosCloseMutexSem(sem->count_mutex);
-
- return 0;
-}
-
-#define thread_sleep(nms) DosSleep(nms)
-
#else
#ifdef __APPLE__
-#define sem_t semaphore_t
-#define sem_init(X, Y, Z) \
- semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z)
-#define sem_wait(sem) (semaphore_wait(*sem))
-#define sem_post(sem) semaphore_signal(*sem)
-#define sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem)
+#define vp8_sem_t semaphore_t
+#define vp8_sem_init(sem, pshared, value) \
+ semaphore_create(mach_task_self(), sem, SYNC_POLICY_FIFO, value)
+#define vp8_sem_wait(sem) semaphore_wait(*sem)
+#define vp8_sem_post(sem) semaphore_signal(*sem)
+#define vp8_sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem)
#else
+#include <errno.h>
#include <unistd.h>
#include <sched.h>
+#define vp8_sem_t sem_t
+#define vp8_sem_init sem_init
+static INLINE int vp8_sem_wait(vp8_sem_t *sem) {
+ int ret;
+ while ((ret = sem_wait(sem)) == -1 && errno == EINTR) {
+ }
+ return ret;
+}
+#define vp8_sem_post sem_post
+#define vp8_sem_destroy sem_destroy
#endif /* __APPLE__ */
/* Not Windows. Assume pthreads */
@@ -194,7 +90,6 @@ static inline int sem_destroy(sem_t *sem) {
#define x86_pause_hint()
#endif
-#include "vpx_util/vpx_thread.h"
#include "vpx_util/vpx_atomics.h"
static INLINE void vp8_atomic_spin_wait(
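
The rewrite above collapses the old per-platform sem_t shims (and drops the OS/2 branch) into a single vp8_sem_* spelling: Win32 semaphores, Mach semaphores on Apple, and POSIX sem_* elsewhere, where vp8_sem_wait() now restarts automatically if sem_wait() is interrupted by a signal (EINTR). The typical lifecycle, as used by the encoder and decoder threads later in this patch (a sketch; return values unchecked):

    vp8_sem_t work_ready;
    vp8_sem_init(&work_ready, 0 /* not shared */, 0 /* initial count */);
    /* producer thread: */
    vp8_sem_post(&work_ready); /* wake one waiter */
    /* worker thread: */
    vp8_sem_wait(&work_ready); /* blocks; retries on EINTR on POSIX */
    /* teardown: */
    vp8_sem_destroy(&work_ready);
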
diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c
index 2248345ba2..88f2de024b 100644
--- a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c
+++ b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c
@@ -428,6 +428,7 @@ int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf) {
#if CONFIG_MULTITHREAD
if (setjmp(fb->pbi[0]->common.error.jmp)) {
+ fb->pbi[0]->common.error.setjmp = 0;
vp8_remove_decoder_instances(fb);
vp8_zero(fb->pbi);
vpx_clear_system_state();
@@ -452,6 +453,7 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb) {
/* decoder instance for single thread mode */
remove_decompressor(pbi);
+ fb->pbi[0] = NULL;
return VPX_CODEC_OK;
}
diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h
index 1070849620..08a60b31b9 100644
--- a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h
+++ b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h
@@ -14,6 +14,7 @@
#include <assert.h>
#include "vpx_config.h"
+#include "vpx_util/vpx_pthread.h"
#include "vp8/common/onyxd.h"
#include "treereader.h"
#include "vp8/common/onyxc_int.h"
@@ -94,8 +95,8 @@ typedef struct VP8D_COMP {
DECODETHREAD_DATA *de_thread_data;
pthread_t *h_decoding_thread;
- sem_t *h_event_start_decoding;
- sem_t h_event_end_decoding;
+ vp8_sem_t *h_event_start_decoding;
+ vp8_sem_t h_event_end_decoding;
/* end of threading data */
#endif
diff --git a/media/libvpx/libvpx/vp8/decoder/threading.c b/media/libvpx/libvpx/vp8/decoder/threading.c
index 6ccb080cf9..d16284d134 100644
--- a/media/libvpx/libvpx/vp8/decoder/threading.c
+++ b/media/libvpx/libvpx/vp8/decoder/threading.c
@@ -15,6 +15,7 @@
#endif
#include "onyxd_int.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_pthread.h"
#include "vp8/common/common.h"
#include "vp8/common/threading.h"
#include "vp8/common/loopfilter.h"
@@ -577,10 +578,10 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
/* signal end of decoding of current thread for current frame */
if (last_mb_row + (int)pbi->decoding_thread_count + 1 >= pc->mb_rows)
- sem_post(&pbi->h_event_end_decoding);
+ vp8_sem_post(&pbi->h_event_end_decoding);
}
-static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
+static THREADFN thread_decoding_proc(void *p_data) {
int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
@@ -589,7 +590,7 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
while (1) {
if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break;
- if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) {
+ if (vp8_sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) {
if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) {
break;
} else {
@@ -598,16 +599,17 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
if (setjmp(xd->error_info.jmp)) {
xd->error_info.setjmp = 0;
// Signal the end of decoding for current thread.
- sem_post(&pbi->h_event_end_decoding);
+ vp8_sem_post(&pbi->h_event_end_decoding);
continue;
}
xd->error_info.setjmp = 1;
mt_decode_mb_rows(pbi, xd, ithread + 1);
+ xd->error_info.setjmp = 0;
}
}
}
- return 0;
+ return THREAD_EXIT_SUCCESS;
}
void vp8_decoder_create_threads(VP8D_COMP *pbi) {
@@ -634,13 +636,13 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32);
CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count);
- if (sem_init(&pbi->h_event_end_decoding, 0, 0)) {
+ if (vp8_sem_init(&pbi->h_event_end_decoding, 0, 0)) {
vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to initialize semaphore");
}
for (ithread = 0; ithread < pbi->decoding_thread_count; ++ithread) {
- if (sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break;
+ if (vp8_sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break;
vp8_setup_block_dptrs(&pbi->mb_row_di[ithread].mbd);
@@ -650,7 +652,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
if (pthread_create(&pbi->h_decoding_thread[ithread], 0,
thread_decoding_proc, &pbi->de_thread_data[ithread])) {
- sem_destroy(&pbi->h_event_start_decoding[ithread]);
+ vp8_sem_destroy(&pbi->h_event_start_decoding[ithread]);
break;
}
}
@@ -661,7 +663,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
/* the remainder of cleanup cases will be handled in
* vp8_decoder_remove_threads(). */
if (pbi->allocated_decoding_thread_count == 0) {
- sem_destroy(&pbi->h_event_end_decoding);
+ vp8_sem_destroy(&pbi->h_event_end_decoding);
}
vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to create threads");
@@ -812,16 +814,16 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) {
/* allow all threads to exit */
for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
- sem_post(&pbi->h_event_start_decoding[i]);
+ vp8_sem_post(&pbi->h_event_start_decoding[i]);
pthread_join(pbi->h_decoding_thread[i], NULL);
}
for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
- sem_destroy(&pbi->h_event_start_decoding[i]);
+ vp8_sem_destroy(&pbi->h_event_start_decoding[i]);
}
if (pbi->allocated_decoding_thread_count) {
- sem_destroy(&pbi->h_event_end_decoding);
+ vp8_sem_destroy(&pbi->h_event_end_decoding);
}
vpx_free(pbi->h_decoding_thread);
@@ -883,7 +885,7 @@ int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) {
pbi->decoding_thread_count);
for (i = 0; i < pbi->decoding_thread_count; ++i) {
- sem_post(&pbi->h_event_start_decoding[i]);
+ vp8_sem_post(&pbi->h_event_start_decoding[i]);
}
if (setjmp(xd->error_info.jmp)) {
@@ -893,15 +895,16 @@ int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) {
// the current frame while the main thread starts decoding the next frame,
// which causes a data race.
for (i = 0; i < pbi->decoding_thread_count; ++i)
- sem_wait(&pbi->h_event_end_decoding);
+ vp8_sem_wait(&pbi->h_event_end_decoding);
return -1;
}
xd->error_info.setjmp = 1;
mt_decode_mb_rows(pbi, xd, 0);
+ xd->error_info.setjmp = 0;
for (i = 0; i < pbi->decoding_thread_count + 1; ++i)
- sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
+ vp8_sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
return 0;
}
diff --git a/media/libvpx/libvpx/vp8/encoder/encodeframe.c b/media/libvpx/libvpx/vp8/encoder/encodeframe.c
index 82c48b13a7..d0117897db 100644
--- a/media/libvpx/libvpx/vp8/encoder/encodeframe.c
+++ b/media/libvpx/libvpx/vp8/encoder/encodeframe.c
@@ -7,38 +7,38 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <stdio.h>
#include <limits.h>
+#include <stdio.h>
#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "bitstream.h"
-#include "encodemb.h"
-#include "encodemv.h"
-#if CONFIG_MULTITHREAD
-#include "ethreading.h"
-#endif
+
#include "vp8/common/common.h"
-#include "onyx_int.h"
-#include "vp8/common/extend.h"
#include "vp8/common/entropymode.h"
-#include "vp8/common/quant_common.h"
-#include "segmentation.h"
-#include "vp8/common/setupintrarecon.h"
-#include "encodeintra.h"
-#include "vp8/common/reconinter.h"
-#include "rdopt.h"
-#include "pickinter.h"
+#include "vp8/common/extend.h"
#include "vp8/common/findnearmv.h"
#include "vp8/common/invtrans.h"
+#include "vp8/common/quant_common.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/setupintrarecon.h"
+#include "vp8/common/threading.h"
+#include "vp8/encoder/bitstream.h"
+#include "vp8/encoder/encodeframe.h"
+#include "vp8/encoder/encodeintra.h"
+#include "vp8/encoder/encodemb.h"
+#include "vp8/encoder/encodemv.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/pickinter.h"
+#include "vp8/encoder/rdopt.h"
+#include "vp8/encoder/segmentation.h"
+#include "vp8_rtcd.h"
#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/vpx_timer.h"
-#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
-#include "bitstream.h"
+
+#if CONFIG_MULTITHREAD
+#include "vp8/encoder/ethreading.h"
#endif
-#include "encodeframe.h"
extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x);
@@ -773,7 +773,7 @@ void vp8_encode_frame(VP8_COMP *cpi) {
vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1);
for (i = 0; i < cpi->encoding_thread_count; ++i) {
- sem_post(&cpi->h_event_start_encoding[i]);
+ vp8_sem_post(&cpi->h_event_start_encoding[i]);
}
for (mb_row = 0; mb_row < cm->mb_rows;
@@ -806,7 +806,7 @@ void vp8_encode_frame(VP8_COMP *cpi) {
}
/* Wait for all the threads to finish. */
for (i = 0; i < cpi->encoding_thread_count; ++i) {
- sem_wait(&cpi->h_event_end_encoding[i]);
+ vp8_sem_wait(&cpi->h_event_end_encoding[i]);
}
for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
diff --git a/media/libvpx/libvpx/vp8/encoder/ethreading.c b/media/libvpx/libvpx/vp8/encoder/ethreading.c
index e2f8b89d46..98c87d3cbc 100644
--- a/media/libvpx/libvpx/vp8/encoder/ethreading.c
+++ b/media/libvpx/libvpx/vp8/encoder/ethreading.c
@@ -10,6 +10,7 @@
#include <stddef.h>
#include "onyx_int.h"
+#include "vpx_util/vpx_pthread.h"
#include "vp8/common/threading.h"
#include "vp8/common/common.h"
#include "vp8/common/extend.h"
@@ -22,27 +23,27 @@
extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x,
int ok_to_skip);
-static THREAD_FUNCTION thread_loopfilter(void *p_data) {
+static THREADFN thread_loopfilter(void *p_data) {
VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1);
VP8_COMMON *cm = &cpi->common;
while (1) {
if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
- if (sem_wait(&cpi->h_event_start_lpf) == 0) {
+ if (vp8_sem_wait(&cpi->h_event_start_lpf) == 0) {
/* we're shutting down */
if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
vp8_loopfilter_frame(cpi, cm);
- sem_post(&cpi->h_event_end_lpf);
+ vp8_sem_post(&cpi->h_event_end_lpf);
}
}
- return 0;
+ return THREAD_EXIT_SUCCESS;
}
-static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
+static THREADFN thread_encoding_proc(void *p_data) {
int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;
VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
@@ -51,7 +52,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
while (1) {
if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
- if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) {
+ if (vp8_sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) {
const int nsync = cpi->mt_sync_range;
VP8_COMMON *cm = &cpi->common;
int mb_row;
@@ -307,12 +308,12 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
x->gf_active_ptr += cm->mb_cols * cpi->encoding_thread_count;
}
/* Signal that this thread has completed processing its rows. */
- sem_post(&cpi->h_event_end_encoding[ithread]);
+ vp8_sem_post(&cpi->h_event_end_encoding[ithread]);
}
}
/* printf("exit thread %d\n", ithread); */
- return 0;
+ return THREAD_EXIT_SUCCESS;
}
static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) {
@@ -514,9 +515,9 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread,
vpx_malloc(sizeof(pthread_t) * th_count));
CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_start_encoding,
- vpx_malloc(sizeof(sem_t) * th_count));
+ vpx_malloc(sizeof(vp8_sem_t) * th_count));
CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_end_encoding,
- vpx_malloc(sizeof(sem_t) * th_count));
+ vpx_malloc(sizeof(vp8_sem_t) * th_count));
CHECK_MEM_ERROR(&cpi->common.error, cpi->mb_row_ei,
vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
@@ -538,8 +539,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
vp8_setup_block_ptrs(&cpi->mb_row_ei[ithread].mb);
vp8_setup_block_dptrs(&cpi->mb_row_ei[ithread].mb.e_mbd);
- sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
- sem_init(&cpi->h_event_end_encoding[ithread], 0, 0);
+ vp8_sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
+ vp8_sem_init(&cpi->h_event_end_encoding[ithread], 0, 0);
ethd->ithread = ithread;
ethd->ptr1 = (void *)cpi;
@@ -554,11 +555,11 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
/* shutdown other threads */
vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
for (--ithread; ithread >= 0; ithread--) {
- sem_post(&cpi->h_event_start_encoding[ithread]);
- sem_post(&cpi->h_event_end_encoding[ithread]);
+ vp8_sem_post(&cpi->h_event_start_encoding[ithread]);
+ vp8_sem_post(&cpi->h_event_end_encoding[ithread]);
pthread_join(cpi->h_encoding_thread[ithread], 0);
- sem_destroy(&cpi->h_event_start_encoding[ithread]);
- sem_destroy(&cpi->h_event_end_encoding[ithread]);
+ vp8_sem_destroy(&cpi->h_event_start_encoding[ithread]);
+ vp8_sem_destroy(&cpi->h_event_end_encoding[ithread]);
}
/* free thread related resources */
@@ -580,8 +581,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
{
LPFTHREAD_DATA *lpfthd = &cpi->lpf_thread_data;
- sem_init(&cpi->h_event_start_lpf, 0, 0);
- sem_init(&cpi->h_event_end_lpf, 0, 0);
+ vp8_sem_init(&cpi->h_event_start_lpf, 0, 0);
+ vp8_sem_init(&cpi->h_event_end_lpf, 0, 0);
lpfthd->ptr1 = (void *)cpi;
rc = pthread_create(&cpi->h_filter_thread, 0, thread_loopfilter, lpfthd);
@@ -590,14 +591,14 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
/* shutdown other threads */
vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
for (--ithread; ithread >= 0; ithread--) {
- sem_post(&cpi->h_event_start_encoding[ithread]);
- sem_post(&cpi->h_event_end_encoding[ithread]);
+ vp8_sem_post(&cpi->h_event_start_encoding[ithread]);
+ vp8_sem_post(&cpi->h_event_end_encoding[ithread]);
pthread_join(cpi->h_encoding_thread[ithread], 0);
- sem_destroy(&cpi->h_event_start_encoding[ithread]);
- sem_destroy(&cpi->h_event_end_encoding[ithread]);
+ vp8_sem_destroy(&cpi->h_event_start_encoding[ithread]);
+ vp8_sem_destroy(&cpi->h_event_end_encoding[ithread]);
}
- sem_destroy(&cpi->h_event_end_lpf);
- sem_destroy(&cpi->h_event_start_lpf);
+ vp8_sem_destroy(&cpi->h_event_end_lpf);
+ vp8_sem_destroy(&cpi->h_event_start_lpf);
/* free thread related resources */
vpx_free(cpi->h_event_start_encoding);
@@ -627,21 +628,21 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) {
int i;
for (i = 0; i < cpi->encoding_thread_count; ++i) {
- sem_post(&cpi->h_event_start_encoding[i]);
- sem_post(&cpi->h_event_end_encoding[i]);
+ vp8_sem_post(&cpi->h_event_start_encoding[i]);
+ vp8_sem_post(&cpi->h_event_end_encoding[i]);
pthread_join(cpi->h_encoding_thread[i], 0);
- sem_destroy(&cpi->h_event_start_encoding[i]);
- sem_destroy(&cpi->h_event_end_encoding[i]);
+ vp8_sem_destroy(&cpi->h_event_start_encoding[i]);
+ vp8_sem_destroy(&cpi->h_event_end_encoding[i]);
}
- sem_post(&cpi->h_event_start_lpf);
+ vp8_sem_post(&cpi->h_event_start_lpf);
pthread_join(cpi->h_filter_thread, 0);
}
- sem_destroy(&cpi->h_event_end_lpf);
- sem_destroy(&cpi->h_event_start_lpf);
+ vp8_sem_destroy(&cpi->h_event_end_lpf);
+ vp8_sem_destroy(&cpi->h_event_start_lpf);
cpi->b_lpf_running = 0;
/* free thread related resources */
diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_if.c b/media/libvpx/libvpx/vp8/encoder/onyx_if.c
index 4e128e3c49..ad01c6fc86 100644
--- a/media/libvpx/libvpx/vp8/encoder/onyx_if.c
+++ b/media/libvpx/libvpx/vp8/encoder/onyx_if.c
@@ -63,7 +63,7 @@
extern int vp8_update_coef_context(VP8_COMP *cpi);
#endif
-extern unsigned int vp8_get_processor_freq();
+extern unsigned int vp8_get_processor_freq(void);
int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
@@ -267,7 +267,11 @@ static int rescale(int val, int num, int denom) {
int64_t llden = denom;
int64_t llval = val;
- return (int)(llval * llnum / llden);
+ int64_t result = (llval * llnum / llden);
+ if (result <= INT_MAX)
+ return (int)result;
+ else
+ return INT_MAX;
}
void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
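
A worked example (illustrative values) of why the new clamp in rescale() matters, using the starting_buffer_level conversion shown below in this file:

    /* rescale(60000 ms, INT_MAX bits/s, 1000):
     *   64-bit product: 60000 * 2147483647 = 128849018820000  (fits int64_t)
     *   after / 1000:   128849018820                          (> INT_MAX)
     * so the function now returns INT_MAX instead of truncating the cast. */
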
@@ -276,7 +280,10 @@ void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
LAYER_CONTEXT *lc = &cpi->layer_context[layer];
lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer];
- lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000;
+ if (cpi->oxcf.target_bitrate[layer] > INT_MAX / 1000)
+ lc->target_bandwidth = INT_MAX;
+ else
+ lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000;
lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level;
lc->optimal_buffer_level_in_ms = oxcf->optimal_buffer_level;
@@ -1381,7 +1388,10 @@ void vp8_update_layer_contexts(VP8_COMP *cpi) {
LAYER_CONTEXT *lc = &cpi->layer_context[i];
lc->framerate = cpi->ref_framerate / oxcf->rate_decimator[i];
- lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;
+ if (oxcf->target_bitrate[i] > INT_MAX / 1000)
+ lc->target_bandwidth = INT_MAX;
+ else
+ lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;
lc->starting_buffer_level = rescale(
(int)oxcf->starting_buffer_level_in_ms, lc->target_bandwidth, 1000);
@@ -1995,6 +2005,7 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) {
#if CONFIG_MULTITHREAD
if (vp8cx_create_encoder_threads(cpi)) {
+ cpi->common.error.setjmp = 0;
vp8_remove_compressor(&cpi);
return 0;
}
@@ -2048,8 +2059,6 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) {
vp8_loop_filter_init(cm);
- cpi->common.error.setjmp = 0;
-
#if CONFIG_MULTI_RES_ENCODING
/* Calculate # of MBs in a row in lower-resolution level image. */
@@ -2076,6 +2085,8 @@ struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) {
vp8_setup_block_ptrs(&cpi->mb);
vp8_setup_block_dptrs(&cpi->mb.e_mbd);
+ cpi->common.error.setjmp = 0;
+
return cpi;
}
@@ -3172,7 +3183,8 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) {
#if CONFIG_MULTITHREAD
if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
- sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
+ /* signal that we have set filter_level */
+ vp8_sem_post(&cpi->h_event_end_lpf);
}
#endif
@@ -4387,11 +4399,11 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
#if CONFIG_MULTITHREAD
if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
/* start loopfilter in separate thread */
- sem_post(&cpi->h_event_start_lpf);
+ vp8_sem_post(&cpi->h_event_start_lpf);
cpi->b_lpf_running = 1;
/* wait for the filter_level to be picked so that we can continue with
* stream packing */
- sem_wait(&cpi->h_event_end_lpf);
+ vp8_sem_wait(&cpi->h_event_end_lpf);
} else
#endif
{
@@ -5120,6 +5132,14 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
vpx_usec_timer_mark(&cmptimer);
cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
+#if CONFIG_MULTITHREAD
+ /* wait for the lpf thread to finish */
+ if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) {
+ vp8_sem_wait(&cpi->h_event_end_lpf);
+ cpi->b_lpf_running = 0;
+ }
+#endif
+
if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) {
generate_psnr_packet(cpi);
}
@@ -5247,16 +5267,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
#endif
#endif
- cpi->common.error.setjmp = 0;
-
-#if CONFIG_MULTITHREAD
- /* wait for the lpf thread done */
- if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) {
- sem_wait(&cpi->h_event_end_lpf);
- cpi->b_lpf_running = 0;
- }
-#endif
-
return 0;
}
diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_int.h b/media/libvpx/libvpx/vp8/encoder/onyx_int.h
index 1451a27812..bb1518ed7f 100644
--- a/media/libvpx/libvpx/vp8/encoder/onyx_int.h
+++ b/media/libvpx/libvpx/vp8/encoder/onyx_int.h
@@ -20,6 +20,7 @@
#include "tokenize.h"
#include "vp8/common/onyxc_int.h"
#include "vpx_dsp/variance.h"
+#include "vpx_util/vpx_pthread.h"
#include "encodemb.h"
#include "vp8/encoder/quantize.h"
#include "vp8/common/entropy.h"
@@ -540,10 +541,10 @@ typedef struct VP8_COMP {
LPFTHREAD_DATA lpf_thread_data;
/* events */
- sem_t *h_event_start_encoding;
- sem_t *h_event_end_encoding;
- sem_t h_event_start_lpf;
- sem_t h_event_end_lpf;
+ vp8_sem_t *h_event_start_encoding;
+ vp8_sem_t *h_event_end_encoding;
+ vp8_sem_t h_event_start_lpf;
+ vp8_sem_t h_event_end_lpf;
#endif
TOKENLIST *tplist;
diff --git a/media/libvpx/libvpx/vp8/encoder/ratectrl.c b/media/libvpx/libvpx/vp8/encoder/ratectrl.c
index fcd4eb04eb..7ba7a308ab 100644
--- a/media/libvpx/libvpx/vp8/encoder/ratectrl.c
+++ b/media/libvpx/libvpx/vp8/encoder/ratectrl.c
@@ -791,8 +791,12 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
(int)((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) /
one_percent_bits);
} else if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level) {
- percent_high =
- (int)((100 * cpi->bits_off_target) / (cpi->total_byte_count * 8));
+ if (cpi->total_byte_count > 0) {
+ percent_high = (int)((100 * cpi->bits_off_target) /
+ (cpi->total_byte_count * 8));
+ } else {
+ percent_high = cpi->oxcf.over_shoot_pct;
+ }
}
if (percent_high > cpi->oxcf.over_shoot_pct) {
@@ -1190,10 +1194,13 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) {
/* Calculate required scaling factor based on target frame size and
* size of frame produced using previous Q
*/
- if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS)) {
- /* Case where we would overflow int */
- target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs)
- << BPER_MB_NORMBITS;
+ if (target_bits_per_frame > (INT_MAX >> BPER_MB_NORMBITS)) {
+ int temp = target_bits_per_frame / cpi->common.MBs;
+ if (temp > (INT_MAX >> BPER_MB_NORMBITS)) {
+ target_bits_per_mb = INT_MAX;
+ } else {
+ target_bits_per_mb = temp << BPER_MB_NORMBITS;
+ }
} else {
target_bits_per_mb =
(target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
@@ -1534,9 +1541,13 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
// undershoots significantly, and then we end up dropping every other
// frame because the QP/rate_correction_factor may have been too low
// before the drop and then takes too long to come up.
- if (target_size >= (INT_MAX >> BPER_MB_NORMBITS)) {
- target_bits_per_mb = (target_size / cpi->common.MBs)
- << BPER_MB_NORMBITS;
+ if (target_size > (INT_MAX >> BPER_MB_NORMBITS)) {
+ int temp = target_size / cpi->common.MBs;
+ if (temp > (INT_MAX >> BPER_MB_NORMBITS)) {
+ target_bits_per_mb = INT_MAX;
+ } else {
+ target_bits_per_mb = temp << BPER_MB_NORMBITS;
+ }
} else {
target_bits_per_mb =
(target_size << BPER_MB_NORMBITS) / cpi->common.MBs;
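
Both hunks above apply the same saturating pattern when converting a frame's bit budget into the fixed-point bits-per-macroblock domain. Condensed into one helper, the idiom reads as follows (a sketch; BPER_MB_NORMBITS is the shift constant used above, and num_mbs is assumed positive):

    #include <limits.h>

    /* Sketch: saturating conversion of a frame bit budget to per-MB fixed
     * point; never left-shifts a value that would overflow int. */
    static int saturating_bits_per_mb(int frame_bits, int num_mbs) {
      if (frame_bits > (INT_MAX >> BPER_MB_NORMBITS)) {
        const int per_mb = frame_bits / num_mbs;
        if (per_mb > (INT_MAX >> BPER_MB_NORMBITS)) return INT_MAX;
        return per_mb << BPER_MB_NORMBITS;
      }
      return (frame_bits << BPER_MB_NORMBITS) / num_mbs;
    }
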
diff --git a/media/libvpx/libvpx/vp8/encoder/tokenize.h b/media/libvpx/libvpx/vp8/encoder/tokenize.h
index 47b5be17f1..5223aa2d86 100644
--- a/media/libvpx/libvpx/vp8/encoder/tokenize.h
+++ b/media/libvpx/libvpx/vp8/encoder/tokenize.h
@@ -18,8 +18,6 @@
extern "C" {
#endif
-void vp8_tokenize_initialize();
-
typedef struct {
short Token;
short Extra;
diff --git a/media/libvpx/libvpx/vp8/vp8_cx_iface.c b/media/libvpx/libvpx/vp8/vp8_cx_iface.c
index 1f16cc53d3..2b238c1a97 100644
--- a/media/libvpx/libvpx/vp8/vp8_cx_iface.c
+++ b/media/libvpx/libvpx/vp8/vp8_cx_iface.c
@@ -8,6 +8,11 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <limits.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
@@ -18,6 +23,7 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/static_assert.h"
#include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_thread.h"
#include "vpx_util/vpx_timestamp.h"
#if CONFIG_MULTITHREAD
#include "vp8/encoder/ethreading.h"
@@ -27,8 +33,6 @@
#include "vp8/encoder/firstpass.h"
#include "vp8/common/onyx.h"
#include "vp8/common/common.h"
-#include <stdlib.h>
-#include <string.h>
struct vp8_extracfg {
struct vpx_codec_pkt_list *pkt_list;
@@ -148,7 +152,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(cfg, g_profile, 3);
RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
- RANGE_CHECK_HI(cfg, g_threads, 64);
+ RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS);
#if CONFIG_REALTIME_ONLY
RANGE_CHECK_HI(cfg, g_lag_in_frames, 0);
#elif CONFIG_MULTI_RES_ENCODING
@@ -495,7 +499,10 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx,
set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
vp8_change_config(ctx->cpi, &ctx->oxcf);
#if CONFIG_MULTITHREAD
- if (vp8cx_create_encoder_threads(ctx->cpi)) return VPX_CODEC_ERROR;
+ if (vp8cx_create_encoder_threads(ctx->cpi)) {
+ ctx->cpi->common.error.setjmp = 0;
+ return VPX_CODEC_ERROR;
+ }
#endif
ctx->cpi->common.error.setjmp = 0;
return VPX_CODEC_OK;
@@ -777,9 +784,9 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
return res;
}
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
- unsigned long duration,
- vpx_enc_deadline_t deadline) {
+static vpx_codec_err_t pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
+ unsigned long duration,
+ vpx_enc_deadline_t deadline) {
int new_qc;
#if !(CONFIG_REALTIME_ONLY)
@@ -788,13 +795,15 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
if (deadline) {
/* Convert duration parameter from stream timebase to microseconds */
- uint64_t duration_us;
-
VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 &&
(TICKS_PER_SEC % 1000000) == 0);
- duration_us = duration * (uint64_t)ctx->timestamp_ratio.num /
- (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000));
+ if (duration > UINT64_MAX / (uint64_t)ctx->timestamp_ratio.num) {
+ ERROR("duration is too big");
+ }
+ uint64_t duration_us =
+ duration * (uint64_t)ctx->timestamp_ratio.num /
+ ((uint64_t)ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000));
 /* If the deadline is more than the duration this frame is to be shown,
* use good quality mode. Otherwise use realtime mode.
@@ -820,6 +829,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
ctx->oxcf.Mode = new_qc;
vp8_change_config(ctx->cpi, &ctx->oxcf);
}
+ return VPX_CODEC_OK;
}
static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx,
@@ -894,13 +904,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
if (!res) res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1);
- if (!ctx->pts_offset_initialized) {
- ctx->pts_offset = pts_val;
- ctx->pts_offset_initialized = 1;
- }
- pts_val -= ctx->pts_offset;
-
- pick_quickcompress_mode(ctx, duration, deadline);
+ if (!res) res = pick_quickcompress_mode(ctx, duration, deadline);
vpx_codec_pkt_list_init(&ctx->pkt_list);
// If no flags are set in the encode call, then use the frame flags as
@@ -924,7 +928,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
/* Initialize the encoder instance on the first frame*/
if (!res && ctx->cpi) {
unsigned int lib_flags;
- YV12_BUFFER_CONFIG sd;
int64_t dst_time_stamp, dst_end_time_stamp;
size_t size, cx_data_sz;
unsigned char *cx_data;
@@ -951,12 +954,44 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
/* Convert API flags to internal codec lib flags */
lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
- dst_time_stamp =
- pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
- dst_end_time_stamp = (pts_val + (int64_t)duration) *
- ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
-
if (img != NULL) {
+ YV12_BUFFER_CONFIG sd;
+
+ if (!ctx->pts_offset_initialized) {
+ ctx->pts_offset = pts_val;
+ ctx->pts_offset_initialized = 1;
+ }
+ if (pts_val < ctx->pts_offset) {
+ vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "pts is smaller than initial pts");
+ }
+ pts_val -= ctx->pts_offset;
+ if (pts_val > INT64_MAX / ctx->timestamp_ratio.num) {
+ vpx_internal_error(
+ &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "conversion of relative pts to ticks would overflow");
+ }
+ dst_time_stamp =
+ pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
+#if ULONG_MAX > INT64_MAX
+ if (duration > INT64_MAX) {
+ vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "duration is too big");
+ }
+#endif
+ if (pts_val > INT64_MAX - (int64_t)duration) {
+ vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "relative pts + duration is too big");
+ }
+ vpx_codec_pts_t pts_end = pts_val + (int64_t)duration;
+ if (pts_end > INT64_MAX / ctx->timestamp_ratio.num) {
+ vpx_internal_error(
+ &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "conversion of relative pts + duration to ticks would overflow");
+ }
+ dst_end_time_stamp =
+ pts_end * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
+
res = image2yuvconfig(img, &sd);
if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
@@ -989,6 +1024,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
&dst_end_time_stamp, !img);
if (comp_data_state == VPX_CODEC_CORRUPT_FRAME) {
+ ctx->cpi->common.error.setjmp = 0;
return VPX_CODEC_CORRUPT_FRAME;
} else if (comp_data_state == -1) {
break;
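
All of the new timestamp checks above guard the same conversion, ticks = relative_pts * timestamp_ratio.num / timestamp_ratio.den, by validating each multiplication against INT64_MAX first. A stand-alone sketch of that shape (hypothetical helper; it returns -1 where the code above raises VPX_CODEC_INVALID_PARAM):

    #include <stdint.h>

    /* Hypothetical: convert a non-negative relative pts to ticks, or
     * return -1 if the multiply would overflow int64_t. */
    static int64_t checked_pts_to_ticks(int64_t rel_pts, int64_t num,
                                        int64_t den) {
      if (rel_pts < 0 || num <= 0 || den <= 0) return -1;
      if (rel_pts > INT64_MAX / num) return -1;
      return rel_pts * num / den;
    }
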
diff --git a/media/libvpx/libvpx/vp8/vp8_dx_iface.c b/media/libvpx/libvpx/vp8/vp8_dx_iface.c
index e81deaf4ea..fa7d7be403 100644
--- a/media/libvpx/libvpx/vp8/vp8_dx_iface.c
+++ b/media/libvpx/libvpx/vp8/vp8_dx_iface.c
@@ -488,7 +488,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
if (pc->fb_idx_ref_cnt[pc->new_fb_idx] > 0) {
pc->fb_idx_ref_cnt[pc->new_fb_idx]--;
}
- pc->error.setjmp = 0;
+ pbi->common.error.setjmp = 0;
#if CONFIG_MULTITHREAD
if (pbi->restart_threads) {
ctx->si.w = 0;
diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc
index 261c316fd1..312092f190 100644
--- a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc
+++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc
@@ -8,10 +8,13 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "vp8/vp8_ratectrl_rtc.h"
+
#include <math.h>
+
#include <new>
+
#include "vp8/common/common.h"
-#include "vp8/vp8_ratectrl_rtc.h"
#include "vp8/encoder/onyx_int.h"
#include "vp8/encoder/ratectrl.h"
#include "vpx_ports/system_state.h"
@@ -311,6 +314,14 @@ FrameDropDecision VP8RateControlRTC::ComputeQP(
int VP8RateControlRTC::GetQP() const { return q_; }
+UVDeltaQP VP8RateControlRTC::GetUVDeltaQP() const {
+ VP8_COMMON *cm = &cpi_->common;
+ UVDeltaQP uv_delta_q;
+ uv_delta_q.uvdc_delta_q = cm->uvdc_delta_q;
+ uv_delta_q.uvac_delta_q = cm->uvac_delta_q;
+ return uv_delta_q;
+}
+
int VP8RateControlRTC::GetLoopfilterLevel() const {
VP8_COMMON *cm = &cpi_->common;
const double qp = q_;
diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h
index 59fb607526..b458b5ce65 100644
--- a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h
+++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h
@@ -21,7 +21,6 @@ struct VP8_COMP;
namespace libvpx {
struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig {
- public:
VP8RateControlRtcConfig() {
memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate));
memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator));
@@ -42,6 +41,9 @@ class VP8RateControlRTC {
bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg);
// GetQP() needs to be called after ComputeQP() to get the latest QP
int GetQP() const;
+ // GetUVDeltaQP() needs to be called after ComputeQP() to get the latest
+ // delta QP for UV.
+ UVDeltaQP GetUVDeltaQP() const;
// GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter
// level is calculated from frame qp.
int GetLoopfilterLevel() const;
@@ -53,10 +55,10 @@ class VP8RateControlRTC {
void PostEncodeUpdate(uint64_t encoded_frame_size);
private:
- VP8RateControlRTC() {}
+ VP8RateControlRTC() = default;
bool InitRateControl(const VP8RateControlRtcConfig &cfg);
- struct VP8_COMP *cpi_;
- int q_;
+ struct VP8_COMP *cpi_ = nullptr;
+ int q_ = -1;
};
} // namespace libvpx
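
The header comments above pin down a per-frame call order: ComputeQP() first, then the getters, then PostEncodeUpdate(). A sketch of one frame through the controller, assuming the Create() factory, the VP8FrameParamsQpRTC parameter type, and the FrameDropDecision enum from the surrounding rtc headers:

    #include <cstdint>
    #include "vp8/vp8_ratectrl_rtc.h"

    // Sketch: per-frame use of VP8RateControlRTC; `rc` comes from
    // libvpx::VP8RateControlRTC::Create(rc_cfg). Returns the QP used,
    // or -1 if the controller asked for the frame to be dropped.
    int RunOneFrame(libvpx::VP8RateControlRTC *rc,
                    const libvpx::VP8FrameParamsQpRTC &frame_params,
                    uint64_t encoded_frame_size) {
      if (rc->ComputeQP(frame_params) == libvpx::FrameDropDecision::kDrop)
        return -1;                                   // do not encode
      const int qp = rc->GetQP();                    // valid after ComputeQP()
      const libvpx::UVDeltaQP uv_dq = rc->GetUVDeltaQP();  // new getter above
      const int lf_level = rc->GetLoopfilterLevel();
      // ... encode the frame using qp, uv_dq and lf_level, then report size:
      rc->PostEncodeUpdate(encoded_frame_size);
      (void)uv_dq;
      (void)lf_level;
      return qp;
    }
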
diff --git a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h
index 1cfc12f6fa..4c8fcf6989 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h
@@ -13,7 +13,6 @@
#include "./vpx_config.h"
#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_util/vpx_thread.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_loopfilter.h"
diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c
index 37762ca15a..1a93b97e56 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c
@@ -12,4 +12,4 @@
#include "./vp9_rtcd.h"
#include "vpx_ports/vpx_once.h"
-void vp9_rtcd() { once(setup_rtcd_internal); }
+void vp9_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl
index 3ecbd5417f..af3ff0e980 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -129,7 +129,7 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
-specialize qw/vp9_block_error_fp neon avx2 sse2/;
+specialize qw/vp9_block_error_fp neon sve avx2 sse2/;
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/;
@@ -138,12 +138,12 @@ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t
specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- specialize qw/vp9_block_error neon avx2 sse2/;
+ specialize qw/vp9_block_error neon sve avx2 sse2/;
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
specialize qw/vp9_highbd_block_error neon sse2/;
} else {
- specialize qw/vp9_block_error neon avx2 msa sse2/;
+ specialize qw/vp9_block_error neon sve avx2 msa sse2/;
}
# fdct functions
diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c
index 8df18af3b8..24adbcbff0 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c
@@ -13,6 +13,7 @@
#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_pthread.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_thread_common.h"
#include "vp9/common/vp9_reconinter.h"
diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h
index 5df0117f12..96c705d0d5 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h
@@ -12,6 +12,7 @@
#define VPX_VP9_COMMON_VP9_THREAD_COMMON_H_
#include "./vpx_config.h"
#include "vp9/common/vp9_loopfilter.h"
+#include "vpx_util/vpx_pthread.h"
#include "vpx_util/vpx_thread.h"
#ifdef __cplusplus
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c
index c5892156f4..4fe680cefc 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -22,6 +22,7 @@
#include "vpx_ports/mem.h"
#include "vpx_ports/mem_ops.h"
#include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_pthread.h"
#include "vpx_util/vpx_thread.h"
#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
#include "vpx_util/vpx_debug_util.h"
@@ -2292,6 +2293,7 @@ static INLINE void init_mt(VP9Decoder *pbi) {
++pbi->num_tile_workers;
winterface->init(worker);
+ worker->thread_name = "vpx tile worker";
if (n < num_threads - 1 && !winterface->reset(worker)) {
do {
winterface->end(&pbi->tile_workers[pbi->num_tile_workers - 1]);
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c
index 5a7e9f9ab3..5c77df5002 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c
@@ -21,6 +21,7 @@
#include "vpx_ports/vpx_once.h"
#include "vpx_ports/vpx_timer.h"
#include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_pthread.h"
#include "vpx_util/vpx_thread.h"
#include "vp9/common/vp9_alloccommon.h"
@@ -210,6 +211,7 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
cm->error.setjmp = 0;
vpx_get_worker_interface()->init(&pbi->lf_worker);
+ pbi->lf_worker.thread_name = "vpx lf worker";
return pbi;
}
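
The thread_name fields set here and in init_mt() above are presumably consumed by the thread-creation path in vpx_util, so that profilers and top can label the workers. A hedged sketch of how such a name typically reaches the OS on Linux (a hypothetical helper, not libvpx's actual code):

#define _GNU_SOURCE /* pthread_setname_np is a GNU extension on glibc */
#include <pthread.h>

/* Linux caps thread names at 16 bytes including the terminator, so
   "vpx tile worker" (15 characters + NUL) just fits. */
static void sketch_name_current_thread(const char *name) {
#if defined(__linux__)
  pthread_setname_np(pthread_self(), name);
#else
  (void)name; /* other platforms expose different, non-portable calls */
#endif
}
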
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h
index 2e198d552e..b3ee4eab5f 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h
@@ -16,6 +16,7 @@
#include "vpx/vpx_codec.h"
#include "vpx_dsp/bitreader.h"
#include "vpx_scale/yv12config.h"
+#include "vpx_util/vpx_pthread.h"
#include "vpx_util/vpx_thread.h"
#include "vp9/common/vp9_thread_common.h"
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c
index 9a31f5a6d0..926ae87739 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c
@@ -12,6 +12,7 @@
#include <string.h>
#include "vpx/vpx_integer.h"
+#include "vpx_util/vpx_pthread.h"
#include "vp9/decoder/vp9_job_queue.h"
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h
index bc23bf9c2c..59f71fb9ba 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h
@@ -11,7 +11,7 @@
#ifndef VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
#define VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
-#include "vpx_util/vpx_thread.h"
+#include "vpx_util/vpx_pthread.h"
typedef struct {
// Pointer to buffer base which contains the jobs
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c
new file mode 100644
index 0000000000..78e7361d85
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+
+int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ int64x2_t err_v = vdupq_n_s64(0);
+ int64x2_t ssz_v = vdupq_n_s64(0);
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+
+ const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+ const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+ const int16x8_t diff0 = vabdq_s16(c0, d0);
+ const int16x8_t diff1 = vabdq_s16(c1, d1);
+
+ err_v = vpx_dotq_s16(err_v, diff0, diff0);
+ err_v = vpx_dotq_s16(err_v, diff1, diff1);
+
+ ssz_v = vpx_dotq_s16(ssz_v, c0, c0);
+ ssz_v = vpx_dotq_s16(ssz_v, c1, c1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ *ssz = horizontal_add_int64x2(ssz_v);
+ return horizontal_add_int64x2(err_v);
+}
+
+int64_t vp9_block_error_fp_sve(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff, int block_size) {
+ int64x2_t err = vdupq_n_s64(0);
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+
+ const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+ const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+ const int16x8_t diff0 = vabdq_s16(c0, d0);
+ const int16x8_t diff1 = vabdq_s16(c1, d1);
+
+ err = vpx_dotq_s16(err, diff0, diff0);
+ err = vpx_dotq_s16(err, diff1, diff1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ return horizontal_add_int64x2(err);
+}
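
Both kernels above lean on the identity |a - b|^2 = (a - b)^2: vabdq_s16 computes absolute differences and vpx_dotq_s16 accumulates their squares into 64-bit lanes, which cannot overflow for 16-bit inputs. A hedged scalar model of vp9_block_error_sve for comparison, treating tran_low_t as int16_t the way the load_tran_low_to_s16q() narrowing loads do:

#include <assert.h>
#include <stdint.h>

/* Returns sum((coeff - dqcoeff)^2) and writes sum(coeff^2) to *ssz. */
static int64_t block_error_model(const int16_t *coeff, const int16_t *dqcoeff,
                                 intptr_t block_size, int64_t *ssz) {
  int64_t err = 0, sq = 0;
  intptr_t i;
  assert(block_size >= 16 && (block_size % 16) == 0);
  for (i = 0; i < block_size; ++i) {
    const int64_t abs_diff = coeff[i] > dqcoeff[i] ? coeff[i] - dqcoeff[i]
                                                   : dqcoeff[i] - coeff[i];
    err += abs_diff * abs_diff;          /* |a - b|^2 == (a - b)^2 */
    sq += (int64_t)coeff[i] * coeff[i];  /* matches the ssz_v dot products */
  }
  *ssz = sq;
  return err;
}
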
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_block.h b/media/libvpx/libvpx/vp9/encoder/vp9_block.h
index 7fa00cd194..6542794667 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_block.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_block.h
@@ -11,8 +11,6 @@
#ifndef VPX_VP9_ENCODER_VP9_BLOCK_H_
#define VPX_VP9_ENCODER_VP9_BLOCK_H_
-#include "vpx_util/vpx_thread.h"
-
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_entropy.h"
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
index 42073f756c..ee0fcd8729 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
@@ -119,8 +119,8 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
PC_TREE *const tree = &td->pc_tree[pc_tree_index];
tree->block_size = square[0];
alloc_tree_contexts(cm, tree, 4);
- tree->leaf_split[0] = this_leaf++;
- for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0];
+ tree->u.leaf_split[0] = this_leaf++;
+ for (j = 1; j < 4; j++) tree->u.leaf_split[j] = tree->u.leaf_split[0];
}
// Each node has 4 leaf nodes, fill each block_size level of the tree
@@ -130,7 +130,7 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
PC_TREE *const tree = &td->pc_tree[pc_tree_index];
alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
tree->block_size = square[square_index];
- for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
+ for (j = 0; j < 4; j++) tree->u.split[j] = this_pc++;
++pc_tree_index;
}
++square_index;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
index 4e301cc17d..51e13ba654 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
@@ -90,7 +90,7 @@ typedef struct PC_TREE {
union {
struct PC_TREE *split[4];
PICK_MODE_CONTEXT *leaf_split[4];
- };
+ } u;
// Obtained from a simple motion search. Used by the ML based partition search
// speed feature.
MV mv;
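
Naming the union here (and the pc_tree->u.split spelling adopted throughout the rest of this patch) avoids anonymous unions, a C11 feature that older or stricter C modes accept only as a vendor extension. A small illustrative comparison:

/* union_sketch.c -- illustrative only */
struct node_anon {
  union {     /* anonymous: members behave like direct struct fields */
    int split;
    int leaf;
  };          /* C11 feature; pre-C11 compilers allow it only as an extension */
};

struct node_named {
  union {
    int split;
    int leaf;
  } u;        /* named member: plain C90/C99 */
};

int read_both(struct node_anon *a, struct node_named *n) {
  return a->split + n->u.split; /* access changes from p->m to p->u.m */
}
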
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
index 46291f4868..b24c85f406 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -21,7 +21,7 @@
#include "vpx_ports/mem.h"
#include "vpx_ports/vpx_timer.h"
#include "vpx_ports/system_state.h"
-
+#include "vpx_util/vpx_pthread.h"
#if CONFIG_MISMATCH_DEBUG
#include "vpx_util/vpx_debug_util.h"
#endif // CONFIG_MISMATCH_DEBUG
@@ -2303,16 +2303,16 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile,
assert(partition == PARTITION_SPLIT);
if (bsize == BLOCK_8X8) {
encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
- pc_tree->leaf_split[0]);
+ pc_tree->u.leaf_split[0]);
} else {
encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
- pc_tree->split[0]);
+ pc_tree->u.split[0]);
encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
- subsize, pc_tree->split[1]);
+ subsize, pc_tree->u.split[1]);
encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
- subsize, pc_tree->split[2]);
+ subsize, pc_tree->u.split[2]);
encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
- subsize, pc_tree->split[3]);
+ subsize, pc_tree->u.split[3]);
}
break;
}
@@ -2645,13 +2645,13 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td,
assert(partition == PARTITION_SPLIT);
subsize = get_subsize(bsize, PARTITION_SPLIT);
encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
- pc_tree->split[0]);
+ pc_tree->u.split[0]);
encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
- subsize, pc_tree->split[1]);
+ subsize, pc_tree->u.split[1]);
encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
- subsize, pc_tree->split[2]);
+ subsize, pc_tree->u.split[2]);
encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs,
- output_enabled, subsize, pc_tree->split[3]);
+ output_enabled, subsize, pc_tree->u.split[3]);
break;
}
@@ -2801,7 +2801,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
assert(partition == PARTITION_SPLIT);
if (bsize == BLOCK_8X8) {
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
- subsize, pc_tree->leaf_split[0], INT_MAX, INT64_MAX);
+ subsize, pc_tree->u.leaf_split[0], INT_MAX, INT64_MAX);
break;
}
last_part_rdc.rate = 0;
@@ -2819,7 +2819,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
rd_use_partition(cpi, td, tile_data, mi_8x8 + jj * bss * mis + ii * bss,
tp, mi_row + y_idx, mi_col + x_idx, subsize,
&tmp_rdc.rate, &tmp_rdc.dist, i != 3,
- pc_tree->split[i]);
+ pc_tree->u.split[i]);
if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
vp9_rd_cost_reset(&last_part_rdc);
break;
@@ -2860,9 +2860,9 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
continue;
save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
- pc_tree->split[i]->partitioning = PARTITION_NONE;
+ pc_tree->u.split[i]->partitioning = PARTITION_NONE;
rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
- &tmp_rdc, split_subsize, &pc_tree->split[i]->none,
+ &tmp_rdc, split_subsize, &pc_tree->u.split[i]->none,
INT_MAX, INT64_MAX);
restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -2877,7 +2877,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
if (i != 3)
encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0,
- split_subsize, pc_tree->split[i]);
+ split_subsize, pc_tree->u.split[i]);
pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
split_subsize);
@@ -3391,7 +3391,7 @@ static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x,
features[feature_index++] = VPXMIN(rd_ratio, 2.0f);
for (i = 0; i < 4; ++i) {
- const int64_t this_rd = pc_tree->split[i]->none.rdcost;
+ const int64_t this_rd = pc_tree->u.split[i]->none.rdcost;
const int rd_valid = this_rd > 0 && this_rd < 1000000000;
// Ratio between sub-block RD and whole block RD.
features[feature_index++] =
@@ -3958,19 +3958,19 @@ static void store_superblock_info(
}
// recursively traverse partition tree when partition is split.
assert(pc_tree->partitioning == PARTITION_SPLIT);
- store_superblock_info(pc_tree->split[0], mi_grid_visible, mi_stride,
+ store_superblock_info(pc_tree->u.split[0], mi_grid_visible, mi_stride,
subblock_square_size_4x4, num_unit_rows, num_unit_cols,
row_start_4x4, col_start_4x4, partition_info,
motion_vector_info);
- store_superblock_info(pc_tree->split[1], mi_grid_visible, mi_stride,
+ store_superblock_info(pc_tree->u.split[1], mi_grid_visible, mi_stride,
subblock_square_size_4x4, num_unit_rows, num_unit_cols,
row_start_4x4, col_start_4x4 + subblock_square_size_4x4,
partition_info, motion_vector_info);
- store_superblock_info(pc_tree->split[2], mi_grid_visible, mi_stride,
+ store_superblock_info(pc_tree->u.split[2], mi_grid_visible, mi_stride,
subblock_square_size_4x4, num_unit_rows, num_unit_cols,
row_start_4x4 + subblock_square_size_4x4, col_start_4x4,
partition_info, motion_vector_info);
- store_superblock_info(pc_tree->split[3], mi_grid_visible, mi_stride,
+ store_superblock_info(pc_tree->u.split[3], mi_grid_visible, mi_stride,
subblock_square_size_4x4, num_unit_rows, num_unit_cols,
row_start_4x4 + subblock_square_size_4x4,
col_start_4x4 + subblock_square_size_4x4,
@@ -4114,7 +4114,7 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
vp9_zero(pc_tree->mv);
}
if (bsize > BLOCK_8X8) { // Store MV result as reference for subblocks.
- for (i = 0; i < 4; ++i) pc_tree->split[i]->mv = pc_tree->mv;
+ for (i = 0; i < 4; ++i) pc_tree->u.split[i]->mv = pc_tree->mv;
}
}
@@ -4199,25 +4199,25 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
// PARTITION_SPLIT
// TODO(jingning): use the motion vectors given by the above search as
// the starting point of motion search in the following partition type check.
- pc_tree->split[0]->none.rdcost = 0;
- pc_tree->split[1]->none.rdcost = 0;
- pc_tree->split[2]->none.rdcost = 0;
- pc_tree->split[3]->none.rdcost = 0;
+ pc_tree->u.split[0]->none.rdcost = 0;
+ pc_tree->u.split[1]->none.rdcost = 0;
+ pc_tree->u.split[2]->none.rdcost = 0;
+ pc_tree->u.split[3]->none.rdcost = 0;
if (do_split || must_split) {
subsize = get_subsize(bsize, PARTITION_SPLIT);
load_pred_mv(x, ctx);
if (bsize == BLOCK_8X8) {
i = 4;
if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
- pc_tree->leaf_split[0]->pred_interp_filter = pred_interp_filter;
+ pc_tree->u.leaf_split[0]->pred_interp_filter = pred_interp_filter;
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
- pc_tree->leaf_split[0], best_rdc.rate, best_rdc.dist);
+ pc_tree->u.leaf_split[0], best_rdc.rate, best_rdc.dist);
if (sum_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
} else {
if (cpi->sf.prune_ref_frame_for_rect_partitions) {
- const int ref1 = pc_tree->leaf_split[0]->mic.ref_frame[0];
- const int ref2 = pc_tree->leaf_split[0]->mic.ref_frame[1];
+ const int ref1 = pc_tree->u.leaf_split[0]->mic.ref_frame[0];
+ const int ref2 = pc_tree->u.leaf_split[0]->mic.ref_frame[1];
for (i = 0; i < 4; ++i) {
ref_frames_used[i] |= (1 << ref1);
if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
@@ -4250,21 +4250,21 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
continue;
- pc_tree->split[i]->index = i;
+ pc_tree->u.split[i]->index = i;
if (cpi->sf.prune_ref_frame_for_rect_partitions)
- pc_tree->split[i]->none.rate = INT_MAX;
+ pc_tree->u.split[i]->none.rate = INT_MAX;
found_best_rd = rd_pick_partition(
cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
- &this_rdc, best_rdc_split, pc_tree->split[i]);
+ &this_rdc, best_rdc_split, pc_tree->u.split[i]);
if (found_best_rd == 0) {
sum_rdc.rdcost = INT64_MAX;
break;
} else {
if (cpi->sf.prune_ref_frame_for_rect_partitions &&
- pc_tree->split[i]->none.rate != INT_MAX) {
- const int ref1 = pc_tree->split[i]->none.mic.ref_frame[0];
- const int ref2 = pc_tree->split[i]->none.mic.ref_frame[1];
+ pc_tree->u.split[i]->none.rate != INT_MAX) {
+ const int ref1 = pc_tree->u.split[i]->none.mic.ref_frame[0];
+ const int ref2 = pc_tree->u.split[i]->none.mic.ref_frame[1];
ref_frames_used[i] |= (1 << ref1);
if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
}
@@ -4821,13 +4821,13 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, int mi_row,
}
break;
case PARTITION_SPLIT: {
- fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->split[0]);
+ fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->u.split[0]);
fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize,
- pc_tree->split[1]);
+ pc_tree->u.split[1]);
fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize,
- pc_tree->split[2]);
+ pc_tree->u.split[2]);
fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize,
- pc_tree->split[3]);
+ pc_tree->u.split[3]);
break;
}
default: break;
@@ -4845,7 +4845,8 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
if (bsize > BLOCK_8X8) {
BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
int i;
- for (i = 0; i < 4; ++i) pred_pixel_ready_reset(pc_tree->split[i], subsize);
+ for (i = 0; i < 4; ++i)
+ pred_pixel_ready_reset(pc_tree->u.split[i], subsize);
}
}
@@ -5046,9 +5047,9 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
continue;
load_pred_mv(x, ctx);
- nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
- mi_col + x_idx, subsize, &this_rdc, 0,
- best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
+ nonrd_pick_partition(
+ cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+ &this_rdc, 0, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->u.split[i]);
if (this_rdc.rate == INT_MAX) {
vp9_rd_cost_reset(&sum_rdc);
@@ -5281,10 +5282,10 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
subsize = get_subsize(bsize, PARTITION_SPLIT);
nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
subsize, output_enabled, rd_cost,
- pc_tree->split[0]);
+ pc_tree->u.split[0]);
nonrd_select_partition(cpi, td, tile_data, mi + hbs, tp, mi_row,
mi_col + hbs, subsize, output_enabled, &this_rdc,
- pc_tree->split[1]);
+ pc_tree->u.split[1]);
if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
rd_cost->rate += this_rdc.rate;
@@ -5292,7 +5293,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
}
nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis, tp,
mi_row + hbs, mi_col, subsize, output_enabled,
- &this_rdc, pc_tree->split[2]);
+ &this_rdc, pc_tree->u.split[2]);
if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
rd_cost->rate += this_rdc.rate;
@@ -5300,7 +5301,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
}
nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
mi_row + hbs, mi_col + hbs, subsize,
- output_enabled, &this_rdc, pc_tree->split[3]);
+ output_enabled, &this_rdc, pc_tree->u.split[3]);
if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
rd_cost->rate += this_rdc.rate;
@@ -5400,21 +5401,21 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td,
subsize = get_subsize(bsize, PARTITION_SPLIT);
if (bsize == BLOCK_8X8) {
nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
- subsize, pc_tree->leaf_split[0]);
+ subsize, pc_tree->u.leaf_split[0]);
encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
- subsize, pc_tree->leaf_split[0]);
+ subsize, pc_tree->u.leaf_split[0]);
} else {
nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize,
- output_enabled, dummy_cost, pc_tree->split[0]);
+ output_enabled, dummy_cost, pc_tree->u.split[0]);
nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp, mi_row,
mi_col + hbs, subsize, output_enabled, dummy_cost,
- pc_tree->split[1]);
+ pc_tree->u.split[1]);
nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp,
mi_row + hbs, mi_col, subsize, output_enabled,
- dummy_cost, pc_tree->split[2]);
+ dummy_cost, pc_tree->u.split[2]);
nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
mi_row + hbs, mi_col + hbs, subsize, output_enabled,
- dummy_cost, pc_tree->split[3]);
+ dummy_cost, pc_tree->u.split[3]);
}
break;
}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
index fd213f1e6b..3b8b5345f1 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
@@ -31,12 +31,14 @@
#include "vpx_ports/system_state.h"
#include "vpx_ports/vpx_once.h"
#include "vpx_ports/vpx_timer.h"
+#include "vpx_util/vpx_pthread.h"
#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
#include "vpx_util/vpx_debug_util.h"
#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_enums.h"
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_idct.h"
#if CONFIG_VP9_POSTPROC
@@ -2135,24 +2137,22 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
cpi->external_resize = 1;
}
- if (cpi->initial_width) {
- int new_mi_size = 0;
- vp9_set_mb_mi(cm, cm->width, cm->height);
- new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
- if (cm->mi_alloc_size < new_mi_size) {
- vp9_free_context_buffers(cm);
- vp9_free_pc_tree(&cpi->td);
- vpx_free(cpi->mbmi_ext_base);
- alloc_compressor_data(cpi);
- realloc_segmentation_maps(cpi);
- cpi->initial_width = cpi->initial_height = 0;
- cpi->external_resize = 0;
- } else if (cm->mi_alloc_size == new_mi_size &&
- (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
- if (vp9_alloc_loop_filter(cm)) {
- vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate loop filter data");
- }
+ int new_mi_size = 0;
+ vp9_set_mb_mi(cm, cm->width, cm->height);
+ new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+ if (cm->mi_alloc_size < new_mi_size) {
+ vp9_free_context_buffers(cm);
+ vp9_free_pc_tree(&cpi->td);
+ vpx_free(cpi->mbmi_ext_base);
+ alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->initial_width = cpi->initial_height = 0;
+ cpi->external_resize = 0;
+ } else if (cm->mi_alloc_size == new_mi_size &&
+ (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
+ if (vp9_alloc_loop_filter(cm)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate loop filter data");
}
}
@@ -3472,7 +3472,6 @@ void vp9_scale_references(VP9_COMP *cpi) {
continue;
}
-#if CONFIG_VP9_HIGHBITDEPTH
if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
RefCntBuffer *new_fb_ptr = NULL;
int force_scaling = 0;
@@ -3485,6 +3484,7 @@ void vp9_scale_references(VP9_COMP *cpi) {
new_fb_ptr = &pool->frame_bufs[new_fb];
if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
new_fb_ptr->buf.y_crop_height != cm->height) {
+#if CONFIG_VP9_HIGHBITDEPTH
if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
cm->use_highbitdepth,
@@ -3494,22 +3494,7 @@ void vp9_scale_references(VP9_COMP *cpi) {
"Failed to allocate frame buffer");
scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth,
EIGHTTAP, 0);
- cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
- alloc_frame_mvs(cm, new_fb);
- }
#else
- if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
- RefCntBuffer *new_fb_ptr = NULL;
- int force_scaling = 0;
- int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
- if (new_fb == INVALID_IDX) {
- new_fb = get_free_fb(cm);
- force_scaling = 1;
- }
- if (new_fb == INVALID_IDX) return;
- new_fb_ptr = &pool->frame_bufs[new_fb];
- if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
- new_fb_ptr->buf.y_crop_height != cm->height) {
if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
VP9_ENC_BORDER_IN_PIXELS,
@@ -3517,10 +3502,10 @@ void vp9_scale_references(VP9_COMP *cpi) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, EIGHTTAP, 0);
+#endif // CONFIG_VP9_HIGHBITDEPTH
cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
alloc_frame_mvs(cm, new_fb);
}
-#endif // CONFIG_VP9_HIGHBITDEPTH
} else {
int buf_idx;
RefCntBuffer *buf = NULL;
@@ -3958,6 +3943,35 @@ static INLINE void set_raw_source_frame(VP9_COMP *cpi) {
#endif
}
+static YV12_BUFFER_CONFIG *svc_twostage_scale(
+ VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+ YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
+ int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) {
+ if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
+ cm->mi_rows * MI_SIZE != unscaled->y_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->bit_depth == VPX_BITS_8) {
+ vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
+ phase_scaler2);
+ vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type,
+ phase_scaler);
+ } else {
+ scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth,
+ filter_type2, phase_scaler2);
+ scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth,
+ filter_type, phase_scaler);
+ }
+#else
+ vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
+ phase_scaler2);
+ vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return scaled;
+ } else {
+ return unscaled;
+ }
+}
+
static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
uint8_t *dest) {
VP9_COMMON *const cm = &cpi->common;
@@ -4000,7 +4014,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
// result will be saved in scaled_temp and might be used later.
const INTERP_FILTER filter_scaler2 = svc->downsample_filter_type[1];
const int phase_scaler2 = svc->downsample_filter_phase[1];
- cpi->Source = vp9_svc_twostage_scale(
+ cpi->Source = svc_twostage_scale(
cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp,
filter_scaler, phase_scaler, filter_scaler2, phase_scaler2);
svc->scaled_one_half = 1;
@@ -4486,21 +4500,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
// external rate control model.
// This flag doesn't have any impact when external rate control is not used.
int ext_rc_recode = 0;
- // Maximal frame size allowed by the external rate control.
- // case: 0, we ignore the max frame size limit, and encode with the qindex
- // passed in by the external rate control model.
- // If the external qindex is VPX_DEFAULT_Q, libvpx will pick a qindex
- // and may recode if undershoot/overshoot is seen.
- // If the external qindex is not VPX_DEFAULT_Q, we force no recode.
- // case: -1, we take libvpx's decision for the max frame size, as well as
- // the recode decision.
- // Otherwise: if a specific size is given, libvpx's recode decision
- // will respect the given size.
- int ext_rc_max_frame_size = 0;
- // Use VP9's decision of qindex. This flag is in use only in external rate
- // control model to help determine whether to recode when
- // |ext_rc_max_frame_size| is 0.
- int ext_rc_use_default_q = 1;
const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth;
#if CONFIG_RATE_CTRL
@@ -4616,27 +4615,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
}
#endif // CONFIG_RATE_CTRL
if (cpi->ext_ratectrl.ready && !ext_rc_recode &&
+ !cpi->tpl_with_external_rc &&
(cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
vpx_codec_err_t codec_status;
const GF_GROUP *gf_group = &cpi->twopass.gf_group;
vpx_rc_encodeframe_decision_t encode_frame_decision;
- FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
- const int ref_frame_flags = get_ref_frame_flags(cpi);
- RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
- const RefCntBuffer *curr_frame_buf =
- get_ref_cnt_buffer(cm, cm->new_fb_idx);
- // index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
- // index 1 refers to the first encoding frame in a gf group.
- // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref.
- // See function define_gf_group_structure().
- const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
- get_ref_frame_bufs(cpi, ref_frame_bufs);
codec_status = vp9_extrc_get_encodeframe_decision(
- &cpi->ext_ratectrl, curr_frame_buf->frame_index,
- cm->current_frame_coding_index, gf_group->index, update_type,
- gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags,
- &encode_frame_decision);
+ &cpi->ext_ratectrl, gf_group->index, &encode_frame_decision);
if (codec_status != VPX_CODEC_OK) {
vpx_internal_error(&cm->error, codec_status,
"vp9_extrc_get_encodeframe_decision() failed");
@@ -4645,9 +4631,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
// libvpx's default q.
if (encode_frame_decision.q_index != VPX_DEFAULT_Q) {
q = encode_frame_decision.q_index;
- ext_rc_use_default_q = 0;
}
- ext_rc_max_frame_size = encode_frame_decision.max_frame_size;
}
vp9_set_quantizer(cpi, q);
@@ -4690,21 +4674,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
if (cpi->ext_ratectrl.ready &&
(cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) {
- // In general, for the external rate control, we take the qindex provided
- // as input and encode the frame with this qindex faithfully. However,
- // in some extreme scenarios, the provided qindex leads to a massive
- // overshoot of frame size. In this case, we fall back to VP9's decision
- // to pick a new qindex and recode the frame. We return the new qindex
- // through the API to the external model.
- if (ext_rc_max_frame_size == 0) {
- if (!ext_rc_use_default_q) break;
- } else if (ext_rc_max_frame_size == -1) {
- // Do nothing, fall back to libvpx's recode decision.
- } else {
- // Change the max frame size, used in libvpx's recode decision.
- rc->max_frame_bandwidth = ext_rc_max_frame_size;
- }
- ext_rc_recode = 1;
+ break;
}
#if CONFIG_RATE_CTRL
if (cpi->oxcf.use_simple_encode_api) {
@@ -4974,35 +4944,6 @@ static void set_ext_overrides(VP9_COMP *cpi) {
}
}
-YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
- VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
- YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
- int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) {
- if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
- cm->mi_rows * MI_SIZE != unscaled->y_height) {
-#if CONFIG_VP9_HIGHBITDEPTH
- if (cm->bit_depth == VPX_BITS_8) {
- vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
- phase_scaler2);
- vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type,
- phase_scaler);
- } else {
- scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth,
- filter_type2, phase_scaler2);
- scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth,
- filter_type, phase_scaler);
- }
-#else
- vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
- phase_scaler2);
- vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler);
-#endif // CONFIG_VP9_HIGHBITDEPTH
- return scaled;
- } else {
- return unscaled;
- }
-}
-
YV12_BUFFER_CONFIG *vp9_scale_if_required(
VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler) {
@@ -6429,7 +6370,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
if (arf_src_index) {
- assert(arf_src_index <= rc->frames_to_key);
+ if (!(cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+ cpi->ext_ratectrl.funcs.get_gop_decision != NULL)) {
+ // This assert only makes sense when not using external RC.
+ assert(arf_src_index <= rc->frames_to_key);
+ }
if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
cpi->alt_ref_source = source;
@@ -6617,7 +6563,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE &&
cpi->sf.enable_tpl_model) {
vp9_init_tpl_buffer(cpi);
- vp9_estimate_qp_gop(cpi);
+ vp9_estimate_tpl_qp_gop(cpi);
vp9_setup_tpl_stats(cpi);
}
#if CONFIG_COLLECT_COMPONENT_TIMING
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
index 91df538821..898855d10d 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
@@ -25,6 +25,7 @@
#include "vpx_dsp/variance.h"
#include "vpx_dsp/psnr.h"
#include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_pthread.h"
#include "vpx_util/vpx_thread.h"
#include "vpx_util/vpx_timestamp.h"
@@ -1062,7 +1063,7 @@ typedef struct VP9_COMP {
*/
uint64_t frame_component_time[kTimingComponents];
#endif
- // Flag to indicate if QP and GOP for TPL is controlled by external RC.
+ // Flag to indicate if QP and GOP for TPL are controlled by external RC.
int tpl_with_external_rc;
} VP9_COMP;
@@ -1395,11 +1396,6 @@ void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst);
#endif // CONFIG_VP9_HIGHBITDEPTH
-YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
- VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
- YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
- int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2);
-
YV12_BUFFER_CONFIG *vp9_scale_if_required(
VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler);
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
index a8d1cb7a7a..c3b79507e6 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
@@ -17,6 +17,7 @@
#include "vp9/encoder/vp9_multi_thread.h"
#include "vp9/encoder/vp9_temporal_filter.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_util/vpx_pthread.h"
static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
int i, j, k, l, m, n;
@@ -55,7 +56,7 @@ static int enc_worker_hook(void *arg1, void *unused) {
vp9_encode_tile(cpi, thread_data->td, tile_row, tile_col);
}
- return 0;
+ return 1;
}
static int get_max_tile_cols(VP9_COMP *cpi) {
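
The return-value changes in this file follow the VPxWorker hook contract: the framework treats a zero return as failure, so these hooks returning 0 made every successful pass register as an error. A hedged sketch of the contract, simplified from vpx_util/vpx_thread.h:

#include <stddef.h>

typedef int (*worker_hook)(void *arg1, void *arg2); /* nonzero == success */

struct sketch_worker {
  worker_hook hook;
  void *data1, *data2;
  int had_error;
};

static void sketch_execute(struct sketch_worker *worker) {
  if (worker->hook != NULL) {
    /* A hook that returns 0 marks the worker as failed. */
    worker->had_error |= !worker->hook(worker->data1, worker->data2);
  }
}
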
@@ -106,6 +107,7 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) {
++cpi->num_workers;
winterface->init(worker);
+ worker->thread_name = "vpx enc worker";
if (i < num_workers - 1) {
thread_data->cpi = cpi;
@@ -204,8 +206,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
create_enc_workers(cpi, num_workers);
for (i = 0; i < num_workers; i++) {
- EncWorkerData *thread_data;
- thread_data = &cpi->tile_thr_data[i];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
// Before encoding a frame, copy the thread data from cpi.
if (thread_data->td != &cpi->td) {
@@ -456,7 +457,7 @@ static int first_pass_worker_hook(void *arg1, void *arg2) {
this_tile, &best_ref_mv, mb_row);
}
}
- return 0;
+ return 1;
}
void vp9_encode_fp_row_mt(VP9_COMP *cpi) {
@@ -543,7 +544,7 @@ static int temporal_filter_worker_hook(void *arg1, void *arg2) {
mb_col_start, mb_col_end);
}
}
- return 0;
+ return 1;
}
void vp9_temporal_filter_row_mt(VP9_COMP *cpi) {
@@ -616,7 +617,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *arg2) {
vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row);
}
}
- return 0;
+ return 1;
}
void vp9_encode_tiles_row_mt(VP9_COMP *cpi) {
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
index 4c192da515..359cdd1290 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
@@ -11,13 +11,14 @@
#ifndef VPX_VP9_ENCODER_VP9_ETHREAD_H_
#define VPX_VP9_ENCODER_VP9_ETHREAD_H_
+#include "vpx_util/vpx_pthread.h"
+
#ifdef __cplusplus
extern "C" {
#endif
#define MAX_NUM_TILE_COLS (1 << 6)
#define MAX_NUM_TILE_ROWS 4
-#define MAX_NUM_THREADS 80
struct VP9_COMP;
struct ThreadData;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
index 4664e8c5e2..7b0d89acd2 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
@@ -156,32 +156,15 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) {
}
vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
- EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
- FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
- RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+ EXT_RATECTRL *ext_ratectrl, int gop_index,
vpx_rc_encodeframe_decision_t *encode_frame_decision) {
- if (ext_ratectrl == NULL) {
- return VPX_CODEC_INVALID_PARAM;
- }
- if (ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0) {
- vpx_rc_status_t rc_status;
- vpx_rc_encodeframe_info_t encode_frame_info;
- encode_frame_info.show_index = show_index;
- encode_frame_info.coding_index = coding_index;
- encode_frame_info.gop_index = gop_index;
- encode_frame_info.frame_type = extrc_get_frame_type(update_type);
- encode_frame_info.gop_size = gop_size;
- encode_frame_info.use_alt_ref = use_alt_ref;
-
- vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
- encode_frame_info.ref_frame_coding_indexes,
- encode_frame_info.ref_frame_valid_list);
+ assert(ext_ratectrl != NULL);
+ assert(ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0);
- rc_status = ext_ratectrl->funcs.get_encodeframe_decision(
- ext_ratectrl->model, &encode_frame_info, encode_frame_decision);
- if (rc_status == VPX_RC_ERROR) {
- return VPX_CODEC_ERROR;
- }
+ vpx_rc_status_t rc_status = ext_ratectrl->funcs.get_encodeframe_decision(
+ ext_ratectrl->model, gop_index, encode_frame_decision);
+ if (rc_status == VPX_RC_ERROR) {
+ return VPX_CODEC_ERROR;
}
return VPX_CODEC_OK;
}
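
With the validation and gop_info plumbing gone, a call site reduces to handing the model a GOP index and reading back a q_index. A hedged sketch of the post-patch usage, mirroring encode_with_recode_loop() above (the helper name is hypothetical):

#include "vp9/encoder/vp9_ext_ratectrl.h"
#include "vpx/vpx_codec.h"
#include "vpx/vpx_ext_ratectrl.h"

/* Ask the external model for a q_index; fall back to libvpx's own choice
   on VPX_DEFAULT_Q or on failure. */
static int sketch_pick_q(EXT_RATECTRL *extrc, int gop_index, int libvpx_q) {
  vpx_rc_encodeframe_decision_t decision;
  if (vp9_extrc_get_encodeframe_decision(extrc, gop_index, &decision) !=
      VPX_CODEC_OK) {
    return libvpx_q; /* the caller would raise vpx_internal_error() instead */
  }
  return decision.q_index == VPX_DEFAULT_Q ? libvpx_q : decision.q_index;
}
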
@@ -222,29 +205,14 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result(
}
vpx_codec_err_t vp9_extrc_get_gop_decision(
- EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info,
- vpx_rc_gop_decision_t *gop_decision) {
+ EXT_RATECTRL *ext_ratectrl, vpx_rc_gop_decision_t *gop_decision) {
vpx_rc_status_t rc_status;
if (ext_ratectrl == NULL || !ext_ratectrl->ready ||
(ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) {
return VPX_CODEC_INVALID_PARAM;
}
- rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model,
- gop_info, gop_decision);
- if (gop_decision->use_alt_ref) {
- const int arf_constraint =
- gop_decision->gop_coding_frames >= gop_info->min_gf_interval &&
- gop_decision->gop_coding_frames < gop_info->lag_in_frames;
- if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR;
- }
- // TODO(chengchen): Take min and max gf interval from the model
- // and overwrite libvpx's decision so that we can get rid
- // of one of the checks here.
- if (gop_decision->gop_coding_frames > gop_info->frames_to_key ||
- gop_decision->gop_coding_frames - gop_decision->use_alt_ref >
- gop_info->max_gf_interval) {
- return VPX_CODEC_ERROR;
- }
+ rc_status =
+ ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, gop_decision);
if (rc_status == VPX_RC_ERROR) {
return VPX_CODEC_ERROR;
}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
index b04580c1d4..d1be5f2aef 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
@@ -39,9 +39,7 @@ vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl,
const VpxTplGopStats *tpl_gop_stats);
vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
- EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
- FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
- RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+ EXT_RATECTRL *ext_ratectrl, int gop_index,
vpx_rc_encodeframe_decision_t *encode_frame_decision);
vpx_codec_err_t vp9_extrc_update_encodeframe_result(
@@ -50,9 +48,8 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result(
const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth,
uint32_t input_bit_depth, const int actual_encoding_qindex);
-vpx_codec_err_t vp9_extrc_get_gop_decision(
- EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info,
- vpx_rc_gop_decision_t *gop_decision);
+vpx_codec_err_t vp9_extrc_get_gop_decision(EXT_RATECTRL *ext_ratectrl,
+ vpx_rc_gop_decision_t *gop_decision);
vpx_codec_err_t vp9_extrc_get_frame_rdmult(
EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
index dcb62e8768..69261ac65f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
@@ -162,42 +162,3 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
dst->uv_stride, src->uv_crop_width, src->uv_crop_height,
et_uv, el_uv, eb_uv, er_uv, chroma_step);
}
-
-void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst, int srcy,
- int srcx, int srch, int srcw) {
- // If the side is not touching the boundary then don't extend.
- const int et_y = srcy ? 0 : dst->border;
- const int el_y = srcx ? 0 : dst->border;
- const int eb_y = srcy + srch != src->y_height
- ? 0
- : dst->border + dst->y_height - src->y_height;
- const int er_y = srcx + srcw != src->y_width
- ? 0
- : dst->border + dst->y_width - src->y_width;
- const int src_y_offset = srcy * src->y_stride + srcx;
- const int dst_y_offset = srcy * dst->y_stride + srcx;
-
- const int et_uv = ROUND_POWER_OF_TWO(et_y, 1);
- const int el_uv = ROUND_POWER_OF_TWO(el_y, 1);
- const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1);
- const int er_uv = ROUND_POWER_OF_TWO(er_y, 1);
- const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
- const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
- const int srch_uv = ROUND_POWER_OF_TWO(srch, 1);
- const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1);
- // detect nv12 colorspace
- const int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1;
-
- copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
- dst->y_buffer + dst_y_offset, dst->y_stride, srcw, srch,
- et_y, el_y, eb_y, er_y, 1);
-
- copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride,
- dst->u_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
- srch_uv, et_uv, el_uv, eb_uv, er_uv, chroma_step);
-
- copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride,
- dst->v_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
- srch_uv, et_uv, el_uv, eb_uv, er_uv, chroma_step);
-}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
index 4ba7fc95e3..21d7e68b9f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
@@ -21,9 +21,6 @@ extern "C" {
void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst);
-void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
- YV12_BUFFER_CONFIG *dst, int srcy,
- int srcx, int srch, int srcw);
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
index a9cdf5353f..58b9b7ba61 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
@@ -37,6 +37,7 @@
#include "vp9/encoder/vp9_mcomp.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rd.h"
+#include "vpx/vpx_ext_ratectrl.h"
#include "vpx_dsp/variance.h"
#define OUTPUT_FPF 0
@@ -1164,7 +1165,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
v_fn_ptr.vf = get_block_variance_fn(bsize);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, 8);
+ v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
this_motion_error =
@@ -2769,38 +2770,6 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
}
}
#endif
- // If the external rate control model for GOP is used, the gop decisions
- // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref|
- // will be overwritten.
- if (cpi->ext_ratectrl.ready &&
- (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
- cpi->ext_ratectrl.funcs.get_gop_decision != NULL && !end_of_sequence) {
- vpx_codec_err_t codec_status;
- vpx_rc_gop_decision_t gop_decision;
- vpx_rc_gop_info_t gop_info;
- gop_info.min_gf_interval = rc->min_gf_interval;
- gop_info.max_gf_interval = rc->max_gf_interval;
- gop_info.active_min_gf_interval = active_gf_interval.min;
- gop_info.active_max_gf_interval = active_gf_interval.max;
- gop_info.allow_alt_ref = allow_alt_ref;
- gop_info.is_key_frame = is_key_frame;
- gop_info.last_gop_use_alt_ref = rc->source_alt_ref_active;
- gop_info.frames_since_key = rc->frames_since_key;
- gop_info.frames_to_key = rc->frames_to_key;
- gop_info.lag_in_frames = cpi->oxcf.lag_in_frames;
- gop_info.show_index = cm->current_video_frame;
- gop_info.coding_index = cm->current_frame_coding_index;
- gop_info.gop_global_index = rc->gop_global_index;
-
- codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info,
- &gop_decision);
- if (codec_status != VPX_CODEC_OK) {
- vpx_internal_error(&cm->error, codec_status,
- "vp9_extrc_get_gop_decision() failed");
- }
- gop_coding_frames = gop_decision.gop_coding_frames;
- use_alt_ref = gop_decision.use_alt_ref;
- }
// Was the group length constrained by the requirement for a new KF?
rc->constrained_gf_group = (gop_coding_frames >= rc->frames_to_key) ? 1 : 0;
@@ -3600,32 +3569,71 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
else
twopass->fr_content_type = FC_NORMAL;
- // Keyframe and section processing.
- if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
- // Define next KF group and assign bits to it.
- find_next_key_frame(cpi, show_idx);
+ // If the external rate control model for GOP is used, the GOP decisions
+ // are overwritten, including whether to use a key frame in this GF group,
+ // the GF group length, and whether to use an ARF.
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+ cpi->ext_ratectrl.funcs.get_gop_decision != NULL &&
+ rc->frames_till_gf_update_due == 0) {
+ vpx_codec_err_t codec_status;
+ vpx_rc_gop_decision_t gop_decision;
+ codec_status =
+ vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_decision);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_get_gop_decision() failed");
+ }
+ if (gop_decision.use_key_frame) {
+ cpi->common.frame_type = KEY_FRAME;
+ rc->frames_since_key = 0;
+ // Clear the alt ref active flag and last group multi arf flags as they
+ // can never be set for a key frame.
+ rc->source_alt_ref_active = 0;
+ // KF is always a GF so clear frames till next gf counter.
+ rc->frames_till_gf_update_due = 0;
+ }
+
+ // A new GF group
+ if (rc->frames_till_gf_update_due == 0) {
+ vp9_zero(twopass->gf_group);
+ ++rc->gop_global_index;
+ if (gop_decision.use_alt_ref) {
+ rc->source_alt_ref_pending = 1;
+ }
+ rc->baseline_gf_interval =
+ gop_decision.gop_coding_frames - rc->source_alt_ref_pending;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ define_gf_group_structure(cpi);
+ }
} else {
- cm->frame_type = INTER_FRAME;
- }
+ // Keyframe and section processing.
+ if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+ // Define next KF group and assign bits to it.
+ find_next_key_frame(cpi, show_idx);
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
- // Define a new GF/ARF group. (Should always enter here for key frames).
- if (rc->frames_till_gf_update_due == 0) {
- define_gf_group(cpi, show_idx);
+ // Define a new GF/ARF group. (Should always enter here for key frames).
+ if (rc->frames_till_gf_update_due == 0) {
+ define_gf_group(cpi, show_idx);
- rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
#if ARF_STATS_OUTPUT
- {
- FILE *fpfile;
- fpfile = fopen("arf.stt", "a");
- ++arf_count;
- fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n",
- cm->current_video_frame, rc->frames_till_gf_update_due,
- rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type);
-
- fclose(fpfile);
- }
+ {
+ FILE *fpfile;
+ fpfile = fopen("arf.stt", "a");
+ ++arf_count;
+ fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n",
+ cm->current_video_frame, rc->frames_till_gf_update_due,
+ rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type);
+
+ fclose(fpfile);
+ }
#endif
+ }
}
vp9_configure_buffer_updates(cpi, gf_group->index);
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
index 97838c38e6..b6be4f88ac 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
@@ -9,6 +9,7 @@
*/
#include <assert.h>
#include <stdlib.h>
+#include <string.h>
#include "./vpx_config.h"
@@ -81,7 +82,6 @@ bail:
return NULL;
}
-#define USE_PARTIAL_COPY 0
int vp9_lookahead_full(const struct lookahead_ctx *ctx) {
return ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz;
}
@@ -94,11 +94,6 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
int64_t ts_start, int64_t ts_end, int use_highbitdepth,
vpx_enc_frame_flags_t flags) {
struct lookahead_entry *buf;
-#if USE_PARTIAL_COPY
- int row, col, active_end;
- int mb_rows = (src->y_height + 15) >> 4;
- int mb_cols = (src->y_width + 15) >> 4;
-#endif
int width = src->y_crop_width;
int height = src->y_crop_height;
int uv_width = src->uv_crop_width;
@@ -119,76 +114,36 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
height != buf->img.y_crop_height ||
uv_width != buf->img.uv_crop_width ||
uv_height != buf->img.uv_crop_height;
- larger_dimensions = width > buf->img.y_width || height > buf->img.y_height ||
- uv_width > buf->img.uv_width ||
- uv_height > buf->img.uv_height;
+ larger_dimensions =
+ width > buf->img.y_crop_width || height > buf->img.y_crop_height ||
+ uv_width > buf->img.uv_crop_width || uv_height > buf->img.uv_crop_height;
assert(!larger_dimensions || new_dimensions);
-#if USE_PARTIAL_COPY
- // TODO(jkoleszar): This is disabled for now, as
- // vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
-
- // Only do this partial copy if the following conditions are all met:
- // 1. Lookahead queue has a size of 1.
- // 2. Active map is provided.
- // 3. This is not a key frame, golden nor altref frame.
- if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) {
- for (row = 0; row < mb_rows; ++row) {
- col = 0;
-
- while (1) {
- // Find the first active macroblock in this row.
- for (; col < mb_cols; ++col) {
- if (active_map[col]) break;
- }
-
- // No more active macroblock in this row.
- if (col == mb_cols) break;
-
- // Find the end of active region in this row.
- active_end = col;
-
- for (; active_end < mb_cols; ++active_end) {
- if (!active_map[active_end]) break;
- }
-
- // Only copy this active region.
- vp9_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4,
- 16, (active_end - col) << 4);
-
- // Start again from the end of this active region.
- col = active_end;
- }
-
- active_map += mb_cols;
- }
- } else {
-#endif
- if (larger_dimensions) {
- YV12_BUFFER_CONFIG new_img;
- memset(&new_img, 0, sizeof(new_img));
- if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x,
- subsampling_y,
+ if (larger_dimensions) {
+ YV12_BUFFER_CONFIG new_img;
+ memset(&new_img, 0, sizeof(new_img));
+ if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x,
+ subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
- use_highbitdepth,
+ use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, 0))
- return 1;
- vpx_free_frame_buffer(&buf->img);
- buf->img = new_img;
- } else if (new_dimensions) {
- buf->img.y_crop_width = src->y_crop_width;
- buf->img.y_crop_height = src->y_crop_height;
- buf->img.uv_crop_width = src->uv_crop_width;
- buf->img.uv_crop_height = src->uv_crop_height;
- buf->img.subsampling_x = src->subsampling_x;
- buf->img.subsampling_y = src->subsampling_y;
- }
- // Partial copy not implemented yet
- vp9_copy_and_extend_frame(src, &buf->img);
-#if USE_PARTIAL_COPY
+ VP9_ENC_BORDER_IN_PIXELS, 0))
+ return 1;
+ vpx_free_frame_buffer(&buf->img);
+ buf->img = new_img;
+ } else if (new_dimensions) {
+ buf->img.y_width = src->y_width;
+ buf->img.y_height = src->y_height;
+ buf->img.uv_width = src->uv_width;
+ buf->img.uv_height = src->uv_height;
+ buf->img.y_crop_width = src->y_crop_width;
+ buf->img.y_crop_height = src->y_crop_height;
+ buf->img.uv_crop_width = src->uv_crop_width;
+ buf->img.uv_crop_height = src->uv_crop_height;
+ buf->img.subsampling_x = src->subsampling_x;
+ buf->img.subsampling_y = src->subsampling_y;
}
-#endif
+ vp9_copy_and_extend_frame(src, &buf->img);
buf->ts_start = ts_start;
buf->ts_end = ts_end;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
index 0843cd97e4..6e124f9944 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
@@ -10,6 +10,7 @@
#include <assert.h>
+#include "vpx_util/vpx_pthread.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_ethread.h"
#include "vp9/encoder/vp9_multi_thread.h"
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
index 3f4fe6957b..d37e020b0a 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
@@ -12,6 +12,7 @@
#include <math.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/bitops.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_quant_common.h"
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
index 62d6b93028..76d5435e60 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -35,6 +35,7 @@
#include "vp9/encoder/vp9_ext_ratectrl.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
#include "vpx/vpx_codec.h"
#include "vpx/vpx_ext_ratectrl.h"
@@ -1433,8 +1434,8 @@ static int rc_constant_q(const VP9_COMP *cpi, int *bottom_index, int *top_index,
return q;
}
-static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
- int *top_index, int gf_group_index) {
+int vp9_rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
+ int *top_index, int gf_group_index) {
const VP9_COMMON *const cm = &cpi->common;
const RATE_CONTROL *const rc = &cpi->rc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
@@ -1581,7 +1582,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
q = active_worst_quality;
}
}
- clamp(q, active_best_quality, active_worst_quality);
*top_index = active_worst_quality;
*bottom_index = active_best_quality;
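
The deleted clamp() call was dead code: clamp() in vpx_dsp/vpx_dsp_common.h returns the clamped value rather than modifying its argument, so the statement discarded its result. A short sketch of the distinction:

/* Local model of clamp(); the real one lives in vpx_dsp/vpx_dsp_common.h. */
static int clamp_sketch(int value, int low, int high) {
  return value < low ? low : (value > high ? high : value);
}

static int pick_q_sketch(int q, int active_best, int active_worst) {
  clamp_sketch(q, active_best, active_worst);     /* result ignored: no-op */
  q = clamp_sketch(q, active_best, active_worst); /* the effective spelling */
  return q;
}
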
@@ -1603,8 +1603,8 @@ int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index,
else
q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index);
} else {
- q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index,
- gf_group_index);
+ q = vp9_rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index,
+ gf_group_index);
}
if (cpi->sf.use_nonrd_pick_mode) {
if (cpi->sf.force_frame_boost == 1) q -= cpi->sf.max_delta_qindex;
@@ -1675,63 +1675,6 @@ void vp9_configure_buffer_updates(VP9_COMP *cpi, int gf_group_index) {
}
}
-void vp9_estimate_qp_gop(VP9_COMP *cpi) {
- int gop_length = cpi->twopass.gf_group.gf_group_size;
- int bottom_index, top_index;
- int idx;
- const int gf_index = cpi->twopass.gf_group.index;
- const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref;
- const int refresh_frame_context = cpi->common.refresh_frame_context;
-
- for (idx = 1; idx <= gop_length; ++idx) {
- TplDepFrame *tpl_frame = &cpi->tpl_stats[idx];
- int target_rate = cpi->twopass.gf_group.bit_allocation[idx];
- cpi->twopass.gf_group.index = idx;
- vp9_rc_set_frame_target(cpi, target_rate);
- vp9_configure_buffer_updates(cpi, idx);
- if (cpi->tpl_with_external_rc) {
- if (cpi->ext_ratectrl.ready &&
- (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
- cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
- VP9_COMMON *cm = &cpi->common;
- vpx_codec_err_t codec_status;
- const GF_GROUP *gf_group = &cpi->twopass.gf_group;
- vpx_rc_encodeframe_decision_t encode_frame_decision;
- FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
- RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
- const RefCntBuffer *curr_frame_buf =
- get_ref_cnt_buffer(cm, cm->new_fb_idx);
- // index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
- // index 1 refers to the first encoding frame in a gf group.
- // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref.
- // See function define_gf_group_structure().
- const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
- const int frame_coding_index = cm->current_frame_coding_index + idx - 1;
- get_ref_frame_bufs(cpi, ref_frame_bufs);
- codec_status = vp9_extrc_get_encodeframe_decision(
- &cpi->ext_ratectrl, curr_frame_buf->frame_index, frame_coding_index,
- gf_group->index, update_type, gf_group->gf_group_size, use_alt_ref,
- ref_frame_bufs, 0 /*ref_frame_flags is not used*/,
- &encode_frame_decision);
- if (codec_status != VPX_CODEC_OK) {
- vpx_internal_error(&cm->error, codec_status,
- "vp9_extrc_get_encodeframe_decision() failed");
- }
- tpl_frame->base_qindex = encode_frame_decision.q_index;
- }
- } else {
- tpl_frame->base_qindex =
- rc_pick_q_and_bounds_two_pass(cpi, &bottom_index, &top_index, idx);
- tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1);
- }
- }
- // Reset the actual index and frame update
- cpi->twopass.gf_group.index = gf_index;
- cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref;
- cpi->common.refresh_frame_context = refresh_frame_context;
- vp9_configure_buffer_updates(cpi, gf_index);
-}
-
void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int frame_target,
int *frame_under_shoot_limit,
int *frame_over_shoot_limit) {
@@ -3361,14 +3304,20 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
cpi->rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
}
// For temporal layers, reset the rate control parameters across all
- // temporal layers. If the first_spatial_layer_to_encode > 0, then this
- // superframe has skipped lower base layers. So in this case we should also
- // reset and force max-q for spatial layers < first_spatial_layer_to_encode.
+ // temporal layers.
+ // If the first_spatial_layer_to_encode > 0, then this superframe has
+ // skipped lower base layers. So in this case we should also reset and
+ // force max-q for spatial layers < first_spatial_layer_to_encode.
+ // For the case of no inter-layer prediction on delta frames: reset and
+ // force max-q for all spatial layers, to avoid excessive frame drops.
if (cpi->use_svc) {
int tl = 0;
int sl = 0;
SVC *svc = &cpi->svc;
- for (sl = 0; sl < VPXMAX(1, svc->first_spatial_layer_to_encode); ++sl) {
+ int num_spatial_layers = VPXMAX(1, svc->first_spatial_layer_to_encode);
+ if (svc->disable_inter_layer_pred != INTER_LAYER_PRED_ON)
+ num_spatial_layers = svc->number_spatial_layers;
+ for (sl = 0; sl < num_spatial_layers; ++sl) {
for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
const int layer =
LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
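The loop above flattens each (spatial, temporal) layer pair into a single index. A minimal sketch of that layout, assuming LAYER_IDS_TO_IDX keeps the usual row-major definition from vp9_svc_layercontext.h (an assumption; the macro itself is not part of this patch):

/* Assumed definition: temporal layers laid out contiguously per spatial
 * layer. */
#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
/* With 3 spatial and 2 temporal layers the reset loop touches:
 *   (0,0)->0 (0,1)->1 (1,0)->2 (1,1)->3 (2,0)->4 (2,1)->5
 * i.e. every layer context exactly once. */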
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
index 48c49e937e..0c61ad3461 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -346,12 +346,14 @@ int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q);
void vp9_configure_buffer_updates(struct VP9_COMP *cpi, int gf_group_index);
-void vp9_estimate_qp_gop(struct VP9_COMP *cpi);
-
void vp9_compute_frame_low_motion(struct VP9_COMP *const cpi);
void vp9_update_buffer_level_svc_preencode(struct VP9_COMP *cpi);
+int vp9_rc_pick_q_and_bounds_two_pass(const struct VP9_COMP *cpi,
+ int *bottom_index, int *top_index,
+ int gf_group_index);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
index 974e43c90f..447136ed84 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
@@ -1834,7 +1834,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi,
return 1;
}
-static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) {
+static INLINE int skip_iters(int_mv iter_mvs[][2], int ite, int id) {
if (ite >= 2 && iter_mvs[ite - 2][!id].as_int == iter_mvs[ite][!id].as_int) {
int_mv cur_fullpel_mv, prev_fullpel_mv;
cur_fullpel_mv.as_mv.row = iter_mvs[ite][id].as_mv.row >> 3;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
index b8910370e0..048ab8732d 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
@@ -18,9 +18,12 @@
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_scan.h"
#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_tpl_model.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vpx_codec.h"
+#include "vpx/vpx_ext_ratectrl.h"
static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture,
const GF_GROUP *gf_group, int *tpl_group_frames) {
@@ -407,8 +410,12 @@ static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats,
tpl_block_stats_ptr->col = mi_col * 8;
tpl_block_stats_ptr->inter_cost = src_stats->inter_cost;
tpl_block_stats_ptr->intra_cost = src_stats->intra_cost;
- tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
- tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+  // inter/intra_cost here are calculated with SATD, which should be close
+  // enough to serve as inter/intra_pred_err
+ tpl_block_stats_ptr->inter_pred_err = src_stats->inter_cost;
+ tpl_block_stats_ptr->intra_pred_err = src_stats->intra_cost;
+ tpl_block_stats_ptr->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_block_stats_ptr->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
tpl_block_stats_ptr->mv_r = src_stats->mv.as_mv.row;
tpl_block_stats_ptr->mv_c = src_stats->mv.as_mv.col;
tpl_block_stats_ptr->ref_frame_index = ref_frame_idx;
@@ -721,7 +728,9 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
tpl_stats->intra_cost = VPXMAX(
1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
- tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
+ if (best_rf_idx >= 0) {
+ tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
+ }
tpl_stats->mv.as_int = best_mv.as_int;
*ref_frame_idx = best_rf_idx;
}
@@ -1489,6 +1498,53 @@ static void accumulate_frame_tpl_stats(VP9_COMP *cpi) {
}
#endif // CONFIG_RATE_CTRL
+void vp9_estimate_tpl_qp_gop(VP9_COMP *cpi) {
+ int gop_length = cpi->twopass.gf_group.gf_group_size;
+ int bottom_index, top_index;
+ int idx;
+ const int gf_index = cpi->twopass.gf_group.index;
+ const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref;
+ const int refresh_frame_context = cpi->common.refresh_frame_context;
+
+ for (idx = 1; idx <= gop_length; ++idx) {
+ TplDepFrame *tpl_frame = &cpi->tpl_stats[idx];
+ int target_rate = cpi->twopass.gf_group.bit_allocation[idx];
+ cpi->twopass.gf_group.index = idx;
+ vp9_rc_set_frame_target(cpi, target_rate);
+ vp9_configure_buffer_updates(cpi, idx);
+ if (cpi->tpl_with_external_rc) {
+ VP9_COMMON *cm = &cpi->common;
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
+ cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
+ vpx_codec_err_t codec_status;
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ vpx_rc_encodeframe_decision_t encode_frame_decision;
+ codec_status = vp9_extrc_get_encodeframe_decision(
+ &cpi->ext_ratectrl, gf_group->index - 1, &encode_frame_decision);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_get_encodeframe_decision() failed");
+ }
+ tpl_frame->base_qindex = encode_frame_decision.q_index;
+ } else {
+ vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
+ "The external rate control library is not set "
+ "properly for TPL pass.");
+ }
+ } else {
+ tpl_frame->base_qindex = vp9_rc_pick_q_and_bounds_two_pass(
+ cpi, &bottom_index, &top_index, idx);
+ tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1);
+ }
+ }
+ // Reset the actual index and frame update
+ cpi->twopass.gf_group.index = gf_index;
+ cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref;
+ cpi->common.refresh_frame_context = refresh_frame_context;
+ vp9_configure_buffer_updates(cpi, gf_index);
+}
+
void vp9_setup_tpl_stats(VP9_COMP *cpi) {
GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE];
const GF_GROUP *gf_group = &cpi->twopass.gf_group;
@@ -1512,12 +1568,16 @@ void vp9_setup_tpl_stats(VP9_COMP *cpi) {
mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize);
}
- // TPL stats has extra frames from next GOP. Trim those extra frames for
- // Qmode.
- trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, extended_frame_count);
-
if (cpi->ext_ratectrl.ready &&
cpi->ext_ratectrl.funcs.send_tpl_gop_stats != NULL) {
+ // Intra search on key frame
+ if (gf_picture[0].update_type == KF_UPDATE) {
+ mc_flow_dispenser(cpi, gf_picture, 0, cpi->tpl_bsize);
+ }
+ // TPL stats has extra frames from next GOP. Trim those extra frames for
+ // Qmode.
+ trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats,
+ extended_frame_count);
const vpx_codec_err_t codec_status =
vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats);
if (codec_status != VPX_CODEC_OK) {
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
index 04beb22610..de0ac39a1f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
@@ -31,6 +31,7 @@ typedef struct GF_PICTURE {
void vp9_init_tpl_buffer(VP9_COMP *cpi);
void vp9_setup_tpl_stats(VP9_COMP *cpi);
void vp9_free_tpl_buffer(VP9_COMP *cpi);
+void vp9_estimate_tpl_qp_gop(VP9_COMP *cpi);
void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
TX_SIZE tx_size);
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index 94506aad0f..628dc4fead 100644
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -886,14 +886,14 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
scale_plane_1_to_2_phase_0(
src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w,
src_h, vp9_filter_kernels[filter_type][8], temp_buffer);
- scale_plane_1_to_2_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
- dst->uv_stride, src_w / 2, src_h / 2,
- vp9_filter_kernels[filter_type][8],
- temp_buffer);
- scale_plane_1_to_2_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
- dst->uv_stride, src_w / 2, src_h / 2,
- vp9_filter_kernels[filter_type][8],
- temp_buffer);
+ const int src_uv_w = src->uv_crop_width;
+ const int src_uv_h = src->uv_crop_height;
+ scale_plane_1_to_2_phase_0(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ src_uv_w, src_uv_h, vp9_filter_kernels[filter_type][8], temp_buffer);
+ scale_plane_1_to_2_phase_0(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ src_uv_w, src_uv_h, vp9_filter_kernels[filter_type][8], temp_buffer);
free(temp_buffer);
}
}
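The switch to uv_crop_width/uv_crop_height matters for odd luma dimensions: src_w / 2 truncates while the crop values round up, so the old code could come up one chroma column and row short. A worked example, assuming the usual 4:2:0 derivation of chroma size as (luma + 1) / 2:

/* Luma source of 359x201 in 4:2:0:
 *   uv_crop_width  = (359 + 1) / 2 = 180, but src_w / 2 = 179
 *   uv_crop_height = (201 + 1) / 2 = 101, but src_h / 2 = 100
 * so halving the luma size would skip the last chroma column and row. */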
diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc
index fd81bce7b5..942c15ce49 100644
--- a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc
+++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc
@@ -12,10 +12,12 @@
#include <new>
#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_picklpf.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_codec.h"
+#include "vpx_mem/vpx_mem.h"
namespace libvpx {
diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.h b/media/libvpx/libvpx/vp9/ratectrl_rtc.h
index 85005c5474..4c39255886 100644
--- a/media/libvpx/libvpx/vp9/ratectrl_rtc.h
+++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.h
@@ -12,43 +12,34 @@
#define VPX_VP9_RATECTRL_RTC_H_
#include <cstdint>
+#include <cstring>
+#include <limits>
#include <memory>
-#include "vp9/common/vp9_enums.h"
-#include "vp9/vp9_iface_common.h"
-#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
-#include "vp9/vp9_cx_iface.h"
+#include "vpx/vpx_encoder.h"
#include "vpx/internal/vpx_ratectrl_rtc.h"
-#include "vpx_mem/vpx_mem.h"
struct VP9_COMP;
namespace libvpx {
struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
- public:
VP9RateControlRtcConfig() {
- ss_number_layers = 1;
- vp9_zero(max_quantizers);
- vp9_zero(min_quantizers);
- vp9_zero(scaling_factor_den);
- vp9_zero(scaling_factor_num);
- vp9_zero(layer_target_bitrate);
- vp9_zero(ts_rate_decimator);
+ memset(layer_target_bitrate, 0, sizeof(layer_target_bitrate));
+ memset(ts_rate_decimator, 0, sizeof(ts_rate_decimator));
scaling_factor_num[0] = 1;
scaling_factor_den[0] = 1;
max_quantizers[0] = max_quantizer;
min_quantizers[0] = min_quantizer;
- max_consec_drop = INT_MAX;
}
// Number of spatial layers
- int ss_number_layers;
- int max_quantizers[VPX_MAX_LAYERS];
- int min_quantizers[VPX_MAX_LAYERS];
- int scaling_factor_num[VPX_SS_MAX_LAYERS];
- int scaling_factor_den[VPX_SS_MAX_LAYERS];
+ int ss_number_layers = 1;
+ int max_quantizers[VPX_MAX_LAYERS] = {};
+ int min_quantizers[VPX_MAX_LAYERS] = {};
+ int scaling_factor_num[VPX_SS_MAX_LAYERS] = {};
+ int scaling_factor_den[VPX_SS_MAX_LAYERS] = {};
// This is only for SVC for now.
- int max_consec_drop;
+ int max_consec_drop = std::numeric_limits<int>::max();
};
struct VP9FrameParamsQpRTC {
@@ -105,9 +96,9 @@ class VP9RateControlRTC {
const VP9FrameParamsQpRTC &frame_params);
private:
- VP9RateControlRTC() {}
+ VP9RateControlRTC() = default;
bool InitRateControl(const VP9RateControlRtcConfig &cfg);
- struct VP9_COMP *cpi_;
+ struct VP9_COMP *cpi_ = nullptr;
};
} // namespace libvpx
diff --git a/media/libvpx/libvpx/vp9/simple_encode.cc b/media/libvpx/libvpx/vp9/simple_encode.cc
index 2e6f9a4513..5e565d1b1a 100644
--- a/media/libvpx/libvpx/vp9/simple_encode.cc
+++ b/media/libvpx/libvpx/vp9/simple_encode.cc
@@ -8,8 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <stdio.h>
+#include <stdlib.h>
+
#include <memory>
#include <vector>
+
#include "./ivfenc.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_enums.h"
@@ -888,6 +892,10 @@ void SimpleEncode::ComputeFirstPassStats() {
use_highbitdepth = impl_ptr_->cpi->common.use_highbitdepth;
#endif
vpx_image_t img;
+ if (impl_ptr_->img_fmt == VPX_IMG_FMT_NV12) {
+ fprintf(stderr, "VPX_IMG_FMT_NV12 is not supported\n");
+ abort();
+ }
vpx_img_alloc(&img, impl_ptr_->img_fmt, frame_width_, frame_height_, 1);
rewind(in_file_);
impl_ptr_->first_pass_stats.clear();
@@ -1053,6 +1061,10 @@ void SimpleEncode::StartEncode() {
vp9_set_first_pass_stats(&oxcf, &stats);
assert(impl_ptr_->cpi == nullptr);
impl_ptr_->cpi = init_encoder(&oxcf, impl_ptr_->img_fmt);
+ if (impl_ptr_->img_fmt == VPX_IMG_FMT_NV12) {
+ fprintf(stderr, "VPX_IMG_FMT_NV12 is not supported\n");
+ abort();
+ }
vpx_img_alloc(&impl_ptr_->tmp_img, impl_ptr_->img_fmt, frame_width_,
frame_height_, 1);
diff --git a/media/libvpx/libvpx/vp9/vp9_cx_iface.c b/media/libvpx/libvpx/vp9/vp9_cx_iface.c
index 8df04f29f0..fe62bac5f2 100644
--- a/media/libvpx/libvpx/vp9/vp9_cx_iface.c
+++ b/media/libvpx/libvpx/vp9/vp9_cx_iface.c
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <limits.h>
+#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@@ -17,6 +19,7 @@
#include "vpx_dsp/psnr.h"
#include "vpx_ports/static_assert.h"
#include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_thread.h"
#include "vpx_util/vpx_timestamp.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "./vpx_version.h"
@@ -110,7 +113,6 @@ struct vpx_codec_alg_priv {
vpx_codec_priv_t base;
vpx_codec_enc_cfg_t cfg;
struct vp9_extracfg extra_cfg;
- vpx_rational64_t timestamp_ratio;
vpx_codec_pts_t pts_offset;
unsigned char pts_offset_initialized;
VP9EncoderConfig oxcf;
@@ -190,7 +192,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 2);
RANGE_CHECK(extra_cfg, alt_ref_aq, 0, 1);
RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1);
- RANGE_CHECK_HI(cfg, g_threads, 64);
+ RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS);
RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q);
RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
@@ -1140,10 +1142,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
if (res == VPX_CODEC_OK) {
priv->pts_offset_initialized = 0;
- // TODO(angiebird): Replace priv->timestamp_ratio by
- // oxcf->g_timebase_in_ts
- priv->timestamp_ratio = get_g_timebase_in_ts(priv->cfg.g_timebase);
-
set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
#if CONFIG_VP9_HIGHBITDEPTH
priv->oxcf.use_highbitdepth =
@@ -1166,9 +1164,9 @@ static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
return VPX_CODEC_OK;
}
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
- unsigned long duration,
- vpx_enc_deadline_t deadline) {
+static vpx_codec_err_t pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
+ unsigned long duration,
+ vpx_enc_deadline_t deadline) {
MODE new_mode = BEST;
#if CONFIG_REALTIME_ONLY
@@ -1179,13 +1177,16 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
case VPX_RC_ONE_PASS:
if (deadline > 0) {
// Convert duration parameter from stream timebase to microseconds.
- uint64_t duration_us;
-
VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 &&
(TICKS_PER_SEC % 1000000) == 0);
- duration_us = duration * (uint64_t)ctx->timestamp_ratio.num /
- (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000));
+ if (duration > UINT64_MAX / (uint64_t)ctx->oxcf.g_timebase_in_ts.num) {
+ ERROR("duration is too big");
+ }
+ uint64_t duration_us = duration *
+ (uint64_t)ctx->oxcf.g_timebase_in_ts.num /
+ ((uint64_t)ctx->oxcf.g_timebase_in_ts.den *
+ (TICKS_PER_SEC / 1000000));
// If the deadline is more than the duration this frame is to be shown,
// use good quality mode. Otherwise use realtime mode.
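The arithmetic above converts a duration in stream-timebase units into microseconds via g_timebase_in_ts. A worked example, assuming TICKS_PER_SEC is 10,000,000 as defined in vpx_util/vpx_timestamp.h:

/* For a 1/30 s timebase, g_timebase_in_ts is TICKS_PER_SEC * 1/30, i.e.
 * num = 10000000 and den = 30. One timebase unit of duration then yields
 *   duration_us = 1 * 10000000 / (30 * (10000000 / 1000000))
 *               = 10000000 / 300 = 33333 microseconds,
 * one frame at 30 fps. The new guard rejects any duration whose
 * multiplication by num would wrap around UINT64_MAX before the divide. */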
@@ -1208,6 +1209,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
ctx->oxcf.mode = new_mode;
vp9_change_config(ctx->cpi, &ctx->oxcf);
}
+ return VPX_CODEC_OK;
}
// Turn on to test if supplemental superframe data breaks decoding
@@ -1281,6 +1283,10 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi,
.is_key_frame))
flags |= VPX_FRAME_IS_KEY;
+ if (!cpi->common.show_frame) {
+ flags |= VPX_FRAME_IS_INVISIBLE;
+ }
+
if (cpi->droppable) flags |= VPX_FRAME_IS_DROPPABLE;
return flags;
@@ -1318,7 +1324,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
volatile vpx_enc_frame_flags_t flags = enc_flags;
volatile vpx_codec_pts_t pts = pts_val;
VP9_COMP *const cpi = ctx->cpi;
- const vpx_rational64_t *const timestamp_ratio = &ctx->timestamp_ratio;
+ const vpx_rational64_t *const timebase_in_ts = &ctx->oxcf.g_timebase_in_ts;
size_t data_sz;
vpx_codec_cx_pkt_t pkt;
memset(&pkt, 0, sizeof(pkt));
@@ -1347,13 +1353,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
}
}
- if (!ctx->pts_offset_initialized) {
- ctx->pts_offset = pts;
- ctx->pts_offset_initialized = 1;
+ res = pick_quickcompress_mode(ctx, duration, deadline);
+ if (res != VPX_CODEC_OK) {
+ return res;
}
- pts -= ctx->pts_offset;
-
- pick_quickcompress_mode(ctx, duration, deadline);
vpx_codec_pkt_list_init(&ctx->pkt_list);
// Handle Flags
@@ -1384,20 +1387,53 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
if (res == VPX_CODEC_OK) {
unsigned int lib_flags = 0;
- YV12_BUFFER_CONFIG sd;
- int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, pts);
size_t size, cx_data_sz;
unsigned char *cx_data;
- cpi->svc.timebase_fac = timebase_units_to_ticks(timestamp_ratio, 1);
- cpi->svc.time_stamp_superframe = dst_time_stamp;
-
// Set up internal flags
if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
if (img != NULL) {
+ YV12_BUFFER_CONFIG sd;
+
+ if (!ctx->pts_offset_initialized) {
+ ctx->pts_offset = pts;
+ ctx->pts_offset_initialized = 1;
+ }
+ if (pts < ctx->pts_offset) {
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "pts is smaller than initial pts");
+ }
+ pts -= ctx->pts_offset;
+ if (pts > INT64_MAX / timebase_in_ts->num) {
+ vpx_internal_error(
+ &cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "conversion of relative pts to ticks would overflow");
+ }
+ const int64_t dst_time_stamp =
+ timebase_units_to_ticks(timebase_in_ts, pts);
+
+ cpi->svc.timebase_fac = timebase_units_to_ticks(timebase_in_ts, 1);
+ cpi->svc.time_stamp_superframe = dst_time_stamp;
+
+#if ULONG_MAX > INT64_MAX
+ if (duration > INT64_MAX) {
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "duration is too big");
+ }
+#endif
+ if (pts > INT64_MAX - (int64_t)duration) {
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "relative pts + duration is too big");
+ }
+ vpx_codec_pts_t pts_end = pts + (int64_t)duration;
+ if (pts_end > INT64_MAX / timebase_in_ts->num) {
+ vpx_internal_error(
+ &cpi->common.error, VPX_CODEC_INVALID_PARAM,
+ "conversion of relative pts + duration to ticks would overflow");
+ }
const int64_t dst_end_time_stamp =
- timebase_units_to_ticks(timestamp_ratio, pts + duration);
+ timebase_units_to_ticks(timebase_in_ts, pts_end);
res = image2yuvconfig(img, &sd);
if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
@@ -1434,7 +1470,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
if (cx_data_sz < ctx->cx_data_sz / 2) {
vpx_internal_error(&cpi->common.error, VPX_CODEC_ERROR,
"Compressed data buffer too small");
- return VPX_CODEC_ERROR;
}
}
@@ -1443,6 +1478,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
// compute first pass stats
if (img) {
int ret;
+ int64_t dst_time_stamp;
int64_t dst_end_time_stamp;
vpx_codec_cx_pkt_t fps_pkt;
ENCODE_FRAME_RESULT encode_frame_result;
@@ -1469,6 +1505,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
#endif // !CONFIG_REALTIME_ONLY
} else {
ENCODE_FRAME_RESULT encode_frame_result;
+ int64_t dst_time_stamp;
int64_t dst_end_time_stamp;
vp9_init_encode_frame_result(&encode_frame_result);
while (cx_data_sz >= ctx->cx_data_sz / 2 &&
@@ -1507,10 +1544,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
if (ctx->output_cx_pkt_cb.output_cx_pkt) {
pkt.kind = VPX_CODEC_CX_FRAME_PKT;
pkt.data.frame.pts =
- ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) +
+ ticks_to_timebase_units(timebase_in_ts, dst_time_stamp) +
ctx->pts_offset;
pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(
- timestamp_ratio, dst_end_time_stamp - dst_time_stamp);
+ timebase_in_ts, dst_end_time_stamp - dst_time_stamp);
pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
pkt.data.frame.buf = ctx->pending_cx_data;
pkt.data.frame.sz = size;
@@ -1527,10 +1564,10 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
// Add the frame packet to the list of returned packets.
pkt.kind = VPX_CODEC_CX_FRAME_PKT;
pkt.data.frame.pts =
- ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) +
+ ticks_to_timebase_units(timebase_in_ts, dst_time_stamp) +
ctx->pts_offset;
pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(
- timestamp_ratio, dst_end_time_stamp - dst_time_stamp);
+ timebase_in_ts, dst_end_time_stamp - dst_time_stamp);
pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width;
pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height;
@@ -1979,6 +2016,7 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx,
ratectrl_config.frame_rate_den = oxcf->g_timebase.num;
ratectrl_config.overshoot_percent = oxcf->over_shoot_pct;
ratectrl_config.undershoot_percent = oxcf->under_shoot_pct;
+ ratectrl_config.base_qp = oxcf->cq_level;
if (oxcf->rc_mode == VPX_VBR) {
ratectrl_config.rc_mode = VPX_RC_VBR;
@@ -2223,7 +2261,7 @@ static vpx_codec_enc_cfg_t get_enc_cfg(int frame_width, int frame_height,
return enc_cfg;
}
-static vp9_extracfg get_extra_cfg() {
+static vp9_extracfg get_extra_cfg(void) {
vp9_extracfg extra_cfg = default_extra_cfg;
return extra_cfg;
}
diff --git a/media/libvpx/libvpx/vp9/vp9_dx_iface.c b/media/libvpx/libvpx/vp9/vp9_dx_iface.c
index 860f721dc5..7567910b9b 100644
--- a/media/libvpx/libvpx/vp9/vp9_dx_iface.c
+++ b/media/libvpx/libvpx/vp9/vp9_dx_iface.c
@@ -19,7 +19,6 @@
#include "vpx/vpx_decoder.h"
#include "vpx_dsp/bitreader_buffer.h"
#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_util/vpx_thread.h"
#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_frame_buffers.h"
diff --git a/media/libvpx/libvpx/vp9/vp9cx.mk b/media/libvpx/libvpx/vp9/vp9cx.mk
index 44790ef6a4..7a0e2d8d1f 100644
--- a/media/libvpx/libvpx/vp9/vp9cx.mk
+++ b/media/libvpx/libvpx/vp9/vp9cx.mk
@@ -140,6 +140,7 @@ endif
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c
+VP9_CX_SRCS-$(HAVE_SVE) += encoder/arm/neon/vp9_error_sve.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
diff --git a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h
index 01d64b14b7..2643b5578a 100644
--- a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h
+++ b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h
@@ -22,8 +22,14 @@ enum class FrameDropDecision {
kDrop, // Frame is dropped.
};
+struct UVDeltaQP {
+  // For the UV channel, the dc/ac QP is GetQP() + uvdc_delta_q /
+  // uvac_delta_q respectively; both deltas are negative.
+ int uvdc_delta_q;
+ int uvac_delta_q;
+};
+
struct VpxRateControlRtcConfig {
- public:
VpxRateControlRtcConfig() {
width = 1280;
height = 720;
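A minimal sketch of how a caller would apply the new UVDeltaQP fields; frame_qp stands in for the rate controller's GetQP() result, and the delta values are made up for illustration:

/* Hypothetical usage: derive chroma QPs from the frame (luma) QP. */
struct UVDeltaQP uv = { -9, -9 };          /* example negative deltas */
int frame_qp = 40;                         /* stand-in for GetQP() */
int uv_dc_qp = frame_qp + uv.uvdc_delta_q; /* 31 */
int uv_ac_qp = frame_qp + uv.uvac_delta_q; /* 31 */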
diff --git a/media/libvpx/libvpx/vpx/src/vpx_encoder.c b/media/libvpx/libvpx/vpx/src/vpx_encoder.c
index 017525aeee..001d854abe 100644
--- a/media/libvpx/libvpx/vpx/src/vpx_encoder.c
+++ b/media/libvpx/libvpx/vpx/src/vpx_encoder.c
@@ -14,6 +14,7 @@
*/
#include <assert.h>
#include <limits.h>
+#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vp8/common/blockd.h"
@@ -184,8 +185,8 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface,
while (0)
#else
-static void FLOATING_POINT_INIT() {}
-static void FLOATING_POINT_RESTORE() {}
+static void FLOATING_POINT_INIT(void) {}
+static void FLOATING_POINT_RESTORE(void) {}
#endif
vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
@@ -200,6 +201,10 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
res = VPX_CODEC_ERROR;
else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER))
res = VPX_CODEC_INCAPABLE;
+#if ULONG_MAX > UINT32_MAX
+ else if (duration > UINT32_MAX || deadline > UINT32_MAX)
+ res = VPX_CODEC_INVALID_PARAM;
+#endif
else {
unsigned int num_enc = ctx->priv->enc.total_encoders;
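The new range check only compiles where unsigned long is wider than 32 bits (e.g. LP64); there it rejects oversized values up front so behavior matches 32-bit builds. A sketch of the caller-visible effect, with ctx and img as stand-ins for an initialized encoder context and input frame:

unsigned long too_long = (unsigned long)UINT32_MAX + 1;
vpx_codec_err_t res =
    vpx_codec_encode(&ctx, &img, /*pts=*/0, too_long,
                     /*flags=*/0, VPX_DL_GOOD_QUALITY);
/* On LP64 targets res is now VPX_CODEC_INVALID_PARAM; previously the
 * value could silently truncate inside the codec. */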
diff --git a/media/libvpx/libvpx/vpx/src/vpx_image.c b/media/libvpx/libvpx/vpx/src/vpx_image.c
index f9f0dd6025..3f7ff74244 100644
--- a/media/libvpx/libvpx/vpx/src/vpx_image.c
+++ b/media/libvpx/libvpx/vpx/src/vpx_image.c
@@ -27,6 +27,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
if (img != NULL) memset(img, 0, sizeof(vpx_image_t));
+ if (fmt == VPX_IMG_FMT_NONE) goto fail;
+
/* Treat align==0 like align==1 */
if (!buf_align) buf_align = 1;
@@ -56,7 +58,7 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
/* Get chroma shift values for this format */
// For VPX_IMG_FMT_NV12, xcs needs to be 0 such that UV data is all read at
- // one time.
+ // once.
switch (fmt) {
case VPX_IMG_FMT_I420:
case VPX_IMG_FMT_YV12:
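With the new guard, the sentinel format fails cleanly at the top of the helper. A minimal sketch:

vpx_image_t img;
/* VPX_IMG_FMT_NONE is now rejected up front: vpx_img_alloc() returns NULL
 * rather than trying to size planes for a meaningless format. */
if (!vpx_img_alloc(&img, VPX_IMG_FMT_NONE, 640, 480, 16)) {
  /* handle the failure */
}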
diff --git a/media/libvpx/libvpx/vpx/src/vpx_tpl.c b/media/libvpx/libvpx/vpx/src/vpx_tpl.c
index 62c2a9c857..b0687a8135 100644
--- a/media/libvpx/libvpx/vpx/src/vpx_tpl.c
+++ b/media/libvpx/libvpx/vpx/src/vpx_tpl.c
@@ -47,8 +47,8 @@ vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file,
"%" PRId64 " %" PRId64 " %" PRId16 " %" PRId16 " %" PRId64
" %" PRId64 " %d\n",
block_stats.inter_cost, block_stats.intra_cost,
- block_stats.mv_c, block_stats.mv_r, block_stats.recrf_dist,
- block_stats.recrf_rate, block_stats.ref_frame_index));
+ block_stats.mv_c, block_stats.mv_r, block_stats.srcrf_dist,
+ block_stats.srcrf_rate, block_stats.ref_frame_index));
}
}
@@ -88,7 +88,7 @@ vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file,
" %" SCNd64 " %d\n",
&block_stats->inter_cost, &block_stats->intra_cost,
&block_stats->mv_c, &block_stats->mv_r,
- &block_stats->recrf_dist, &block_stats->recrf_rate,
+ &block_stats->srcrf_dist, &block_stats->srcrf_rate,
&block_stats->ref_frame_index),
7);
}
diff --git a/media/libvpx/libvpx/vpx/vp8cx.h b/media/libvpx/libvpx/vpx/vp8cx.h
index b12938d3d8..dfdbb3c770 100644
--- a/media/libvpx/libvpx/vpx/vp8cx.h
+++ b/media/libvpx/libvpx/vpx/vp8cx.h
@@ -772,6 +772,8 @@ enum vp8e_enc_control_id {
/*!\brief Codec control to use external RC to control TPL.
*
* This will use external RC to control the QP and GOP structure for TPL.
+ * (rc_type & VPX_RC_QP) in vpx_rc_funcs_t must be non-zero.
+ * get_encodeframe_decision callback in vpx_rc_funcs_t also needs to be set.
*
* Supported in codecs: VP9
*/
diff --git a/media/libvpx/libvpx/vpx/vpx_encoder.h b/media/libvpx/libvpx/vpx/vpx_encoder.h
index 18e3862bd7..809a097d94 100644
--- a/media/libvpx/libvpx/vpx/vpx_encoder.h
+++ b/media/libvpx/libvpx/vpx/vpx_encoder.h
@@ -31,7 +31,6 @@ extern "C" {
#include "./vpx_codec.h" // IWYU pragma: export
#include "./vpx_ext_ratectrl.h"
-#include "./vpx_tpl.h"
/*! Temporal Scalability: Maximum length of the sequence defining frame
* layer membership
@@ -57,10 +56,15 @@ extern "C" {
* must be bumped. Examples include, but are not limited to, changing
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
+ *
+ * \note
+ * VPX_ENCODER_ABI_VERSION has a VPX_EXT_RATECTRL_ABI_VERSION component
+ * because the VP9E_SET_EXTERNAL_RATE_CONTROL codec control uses
+ * vpx_rc_funcs_t.
*/
-#define VPX_ENCODER_ABI_VERSION \
- (16 + VPX_CODEC_ABI_VERSION + VPX_EXT_RATECTRL_ABI_VERSION + \
- VPX_TPL_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_ENCODER_ABI_VERSION \
+ (18 + VPX_CODEC_ABI_VERSION + \
+ VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/
/*! \brief Encoder capabilities bitfield
*
@@ -1074,6 +1078,12 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
* The buffer was set successfully.
* \retval #VPX_CODEC_INVALID_PARAM
* A parameter was NULL, the image format is unsupported, etc.
+ *
+ * \note
+ * `duration` and `deadline` are of the unsigned long type, which can be 32
+ * or 64 bits. `duration` and `deadline` must be less than or equal to
+ * UINT32_MAX so that their ranges are independent of the size of unsigned
+ * long.
*/
vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx,
const vpx_fixed_buf_t *buf,
diff --git a/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h
index 46d290dff4..ba12e4f83b 100644
--- a/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h
+++ b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h
@@ -26,7 +26,7 @@ extern "C" {
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures.
*/
-#define VPX_EXT_RATECTRL_ABI_VERSION (7)
+#define VPX_EXT_RATECTRL_ABI_VERSION (5 + VPX_TPL_ABI_VERSION)
/*!\brief The control type of the inference API.
* In VPX_RC_QP mode, the external rate control model determines the
@@ -81,17 +81,10 @@ typedef void *vpx_rc_model_t;
*
* The encoder will receive the decision from the external rate control model
* through get_encodeframe_decision() defined in vpx_rc_funcs_t.
- *
- * If q_index = VPX_DEFAULT_Q, the encoder will use libvpx's default q.
- *
- * If max_frame_size = 0, the encoding ignores max frame size limit.
- * If max_frame_size = -1, the encoding uses VP9's max frame size as the limit.
- * If the encoded frame size is larger than max_frame_size, the frame is
- * recoded to meet the size limit, following VP9's recoding principles.
*/
typedef struct vpx_rc_encodeframe_decision {
- int q_index; /**< Quantizer step index [0..255]*/
- int max_frame_size; /**< Maximal frame size allowed to encode a frame*/
+ int q_index; /**< Quantizer step index [0..255]*/
+ int rdmult; /**< Frame level Lagrangian multiplier*/
} vpx_rc_encodeframe_decision_t;
/*!\brief Information for the frame to be encoded.
@@ -322,6 +315,7 @@ typedef struct vpx_rc_config {
vpx_ext_rc_mode_t rc_mode; /**< Q mode or VBR mode */
int overshoot_percent; /**< for VBR mode only */
int undershoot_percent; /**< for VBR mode only */
+ int base_qp; /**< base QP for leaf frames, 0-255 */
} vpx_rc_config_t;
/*!\brief Information passed to the external rate control model to
@@ -400,6 +394,7 @@ typedef struct vpx_rc_gop_info {
typedef struct vpx_rc_gop_decision {
int gop_coding_frames; /**< The number of frames of this GOP */
int use_alt_ref; /**< Whether to use alt ref for this GOP */
+ int use_key_frame; /**< Whether to set key frame for this GOP */
} vpx_rc_gop_decision_t;
/*!\brief Create an external rate control model callback prototype
@@ -446,12 +441,11 @@ typedef vpx_rc_status_t (*vpx_rc_send_tpl_gop_stats_cb_fn_t)(
* the external rate control model.
*
* \param[in] rate_ctrl_model rate control model
- * \param[in] encode_frame_info information of the coding frame
+ * \param[in]  frame_gop_index  index of the frame in the current GOP
* \param[out] frame_decision encode decision of the coding frame
*/
typedef vpx_rc_status_t (*vpx_rc_get_encodeframe_decision_cb_fn_t)(
- vpx_rc_model_t rate_ctrl_model,
- const vpx_rc_encodeframe_info_t *encode_frame_info,
+ vpx_rc_model_t rate_ctrl_model, const int frame_gop_index,
vpx_rc_encodeframe_decision_t *frame_decision);
/*!\brief Update encode frame result callback prototype
@@ -472,12 +466,10 @@ typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)(
* the external rate control model.
*
* \param[in] rate_ctrl_model rate control model
- * \param[in] gop_info information collected from the encoder
* \param[out] gop_decision GOP decision from the model
*/
typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)(
- vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
- vpx_rc_gop_decision_t *gop_decision);
+ vpx_rc_model_t rate_ctrl_model, vpx_rc_gop_decision_t *gop_decision);
/*!\brief Get the frame rdmult from the external rate control model.
*
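With the slimmed-down prototypes above, the model no longer receives per-frame or per-GOP info structs from the encoder. A minimal sketch of callbacks matching the new signatures; the fixed q_index and GOP shape are placeholder policy, not anything the API prescribes:

/* Hypothetical model callbacks for the new prototypes. */
static vpx_rc_status_t my_get_encodeframe_decision(
    vpx_rc_model_t model, const int frame_gop_index,
    vpx_rc_encodeframe_decision_t *decision) {
  (void)model;
  (void)frame_gop_index;
  decision->q_index = 100; /* placeholder; a real model computes this */
  decision->rdmult = 0;    /* placeholder */
  return VPX_RC_OK;
}

static vpx_rc_status_t my_get_gop_decision(vpx_rc_model_t model,
                                           vpx_rc_gop_decision_t *gop) {
  (void)model;
  gop->gop_coding_frames = 16; /* placeholder GOP length */
  gop->use_alt_ref = 1;
  gop->use_key_frame = 0;
  return VPX_RC_OK;
}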
diff --git a/media/libvpx/libvpx/vpx/vpx_tpl.h b/media/libvpx/libvpx/vpx/vpx_tpl.h
index a250aada60..7e4c9ab7e1 100644
--- a/media/libvpx/libvpx/vpx/vpx_tpl.h
+++ b/media/libvpx/libvpx/vpx/vpx_tpl.h
@@ -32,19 +32,21 @@ extern "C" {
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
-#define VPX_TPL_ABI_VERSION (2) /**<\hideinitializer*/
+#define VPX_TPL_ABI_VERSION (3) /**<\hideinitializer*/
/*!\brief Temporal dependency model stats for each block before propagation */
typedef struct VpxTplBlockStats {
- int16_t row; /**< Pixel row of the top left corner */
- int16_t col; /**< Pixel col of the top left corner */
- int64_t intra_cost; /**< Intra cost */
- int64_t inter_cost; /**< Inter cost */
- int16_t mv_r; /**< Motion vector row */
- int16_t mv_c; /**< Motion vector col */
- int64_t recrf_rate; /**< Rate from reconstructed ref frame */
- int64_t recrf_dist; /**< Distortion from reconstructed ref frame */
- int ref_frame_index; /**< Ref frame index in the ref frame buffer */
+ int16_t row; /**< Pixel row of the top left corner */
+ int16_t col; /**< Pixel col of the top left corner */
+ int64_t intra_cost; /**< Intra cost */
+ int64_t inter_cost; /**< Inter cost */
+ int16_t mv_r; /**< Motion vector row */
+ int16_t mv_c; /**< Motion vector col */
+ int64_t srcrf_rate; /**< Rate from source ref frame */
+ int64_t srcrf_dist; /**< Distortion from source ref frame */
+ int64_t inter_pred_err; /**< Inter prediction error */
+ int64_t intra_pred_err; /**< Intra prediction error */
+ int ref_frame_index; /**< Ref frame index in the ref frame buffer */
} VpxTplBlockStats;
/*!\brief Temporal dependency model stats for each frame before propagation */
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
index 683df5797a..f8b94620d4 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
@@ -168,40 +168,40 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
\
if (xoffset == 0) { \
if (yoffset == 0) { \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp[w * h]; \
highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
h); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \
src_stride, h, yoffset); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} \
} else if (xoffset == 4) { \
uint16_t tmp0[w * (h + 1)]; \
if (yoffset == 0) { \
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * (h + 1)]; \
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
(h + 1)); \
highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * (h + 1)]; \
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
(h + 1)); \
highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} else { \
@@ -209,21 +209,21 @@ static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
if (yoffset == 0) { \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
xoffset); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
(h + 1), xoffset); \
highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
(h + 1), xoffset); \
highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} \
@@ -430,22 +430,22 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
} while (--i != 0);
}
-#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
- uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref, int ref_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t tmp0[w * (h + 1)]; \
- uint16_t tmp1[w * h]; \
- uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
- \
- highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
- xoffset); \
- highbd_avg_pred_var_filter_block2d_bil_w##w( \
- tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
- \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
- CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ \
+ return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+ w, ref, ref_stride, sse); \
}
#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
@@ -460,19 +460,19 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
if (yoffset == 0) { \
highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \
CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
highbd_avg_pred_var_filter_block2d_avg( \
src_ptr, tmp, source_stride, source_stride, w, h, \
CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} else { \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
src_ptr, tmp, source_stride, source_stride, h, yoffset, \
CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} \
} else if (xoffset == 4) { \
@@ -481,7 +481,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
highbd_avg_pred_var_filter_block2d_avg( \
src_ptr, tmp0, source_stride, 1, w, h, \
CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * (h + 1)]; \
@@ -489,7 +489,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
(h + 1)); \
highbd_avg_pred_var_filter_block2d_avg( \
tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * (h + 1)]; \
@@ -497,7 +497,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
(h + 1)); \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} else { \
@@ -506,7 +506,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
highbd_avg_pred_var_filter_block2d_bil_w##w( \
src_ptr, tmp0, source_stride, 1, h, xoffset, \
CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * h]; \
@@ -514,7 +514,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
(h + 1), xoffset); \
highbd_avg_pred_var_filter_block2d_avg( \
tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * h]; \
@@ -522,7 +522,7 @@ static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
(h + 1), xoffset); \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
- return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
+ return vpx_highbd_##bitdepth##_variance##w##x##h( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} \
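Dropping the _neon suffix on these tail calls routes them through the RTCD dispatch name, so the best specialization available at runtime is used rather than pinning to NEON. For instance, with bitdepth 10 and an 8x8 block:

/* vpx_highbd_##bitdepth##_variance##w##x##h expands to
 * vpx_highbd_10_variance8x8, the rtcd entry point, which may resolve to
 * the C, NEON, or (with this patch) SVE implementation. */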
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c
new file mode 100644
index 0000000000..cebe06b099
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+#include "vpx_ports/mem.h"
+
+static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int w, int h) {
+ uint64x2_t sse = vdupq_n_u64(0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+ uint16x8_t diff = vabdq_u16(s, r);
+
+ sse = vpx_dotq_u16(sse, diff, diff);
+
+ j += 8;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ return (uint32_t)horizontal_add_uint64x2(sse);
+}
+
+#define HIGHBD_MSE_WXH_SVE(w, h) \
+ uint32_t vpx_highbd_10_mse##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ uint32_t sse_tmp = \
+ highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h); \
+ sse_tmp = ROUND_POWER_OF_TWO(sse_tmp, 4); \
+ *sse = sse_tmp; \
+ return sse_tmp; \
+ } \
+ \
+ uint32_t vpx_highbd_12_mse##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ uint32_t sse_tmp = \
+ highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h); \
+ sse_tmp = ROUND_POWER_OF_TWO(sse_tmp, 8); \
+ *sse = sse_tmp; \
+ return sse_tmp; \
+ }
+
+HIGHBD_MSE_WXH_SVE(16, 16)
+HIGHBD_MSE_WXH_SVE(16, 8)
+HIGHBD_MSE_WXH_SVE(8, 16)
+HIGHBD_MSE_WXH_SVE(8, 8)
+
+#undef HIGHBD_MSE_WXH_SVE
+
+// Process a block of width 4 two rows at a time.
+static INLINE void highbd_variance_4xh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h, uint64_t *sse,
+ int64_t *sum) {
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ do {
+ const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride);
+ const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride);
+
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+ sum_s16 = vaddq_s16(sum_s16, diff);
+ sse_s64 = vpx_dotq_s16(sse_s64, diff, diff);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ h -= 2;
+ } while (h != 0);
+
+ *sum = horizontal_add_int16x8(sum_s16);
+ *sse = horizontal_add_int64x2(sse_s64);
+}
+
+static INLINE void highbd_variance_8xh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h, uint64_t *sse,
+ int64_t *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ do {
+ const uint16x8_t s = vld1q_u16(src_ptr);
+ const uint16x8_t r = vld1q_u16(ref_ptr);
+
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+ sum_s32 = vpadalq_s16(sum_s32, diff);
+ sse_s64 = vpx_dotq_s16(sse_s64, diff, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ *sum = horizontal_add_int32x4(sum_s32);
+ *sse = horizontal_add_int64x2(sse_s64);
+}
+
+static INLINE void highbd_variance_16xh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ const uint16x8_t s0 = vld1q_u16(src_ptr);
+ const uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+
+ const uint16x8_t r0 = vld1q_u16(ref_ptr);
+ const uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+
+ const int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+ const int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(s1, r1));
+
+ sum_s32[0] = vpadalq_s16(sum_s32[0], diff0);
+ sum_s32[1] = vpadalq_s16(sum_s32[1], diff1);
+
+ sse_s64[0] = vpx_dotq_s16(sse_s64[0], diff0, diff0);
+ sse_s64[1] = vpx_dotq_s16(sse_s64[1], diff1, diff1);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]);
+ sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]);
+
+ *sum = horizontal_add_int32x4(sum_s32[0]);
+ *sse = horizontal_add_int64x2(sse_s64[0]);
+}
+
+static INLINE void highbd_variance_wxh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int w, int h,
+ uint64_t *sse, int64_t *sum) {
+ int32x4_t sum_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+ int64x2_t sse_s64[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+ vdupq_n_s64(0) };
+
+ do {
+ int i = 0;
+ do {
+ const uint16x8_t s0 = vld1q_u16(src_ptr + i);
+ const uint16x8_t s1 = vld1q_u16(src_ptr + i + 8);
+ const uint16x8_t s2 = vld1q_u16(src_ptr + i + 16);
+ const uint16x8_t s3 = vld1q_u16(src_ptr + i + 24);
+
+ const uint16x8_t r0 = vld1q_u16(ref_ptr + i);
+ const uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8);
+ const uint16x8_t r2 = vld1q_u16(ref_ptr + i + 16);
+ const uint16x8_t r3 = vld1q_u16(ref_ptr + i + 24);
+
+ const int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+ const int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(s1, r1));
+ const int16x8_t diff2 = vreinterpretq_s16_u16(vsubq_u16(s2, r2));
+ const int16x8_t diff3 = vreinterpretq_s16_u16(vsubq_u16(s3, r3));
+
+ sum_s32[0] = vpadalq_s16(sum_s32[0], diff0);
+ sum_s32[1] = vpadalq_s16(sum_s32[1], diff1);
+ sum_s32[2] = vpadalq_s16(sum_s32[2], diff2);
+ sum_s32[3] = vpadalq_s16(sum_s32[3], diff3);
+
+ sse_s64[0] = vpx_dotq_s16(sse_s64[0], diff0, diff0);
+ sse_s64[1] = vpx_dotq_s16(sse_s64[1], diff1, diff1);
+ sse_s64[2] = vpx_dotq_s16(sse_s64[2], diff2, diff2);
+ sse_s64[3] = vpx_dotq_s16(sse_s64[3], diff3, diff3);
+
+ i += 32;
+ } while (i < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]);
+ sum_s32[2] = vaddq_s32(sum_s32[2], sum_s32[3]);
+ sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[2]);
+
+ sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]);
+ sse_s64[2] = vaddq_s64(sse_s64[2], sse_s64[3]);
+ sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[2]);
+
+ *sum = horizontal_add_int32x4(sum_s32[0]);
+ *sse = horizontal_add_int64x2(sse_s64[0]);
+}
+
+static INLINE void highbd_variance_32xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse,
+ int64_t *sum) {
+ highbd_variance_wxh_sve(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
+
+static INLINE void highbd_variance_64xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse,
+ int64_t *sum) {
+ highbd_variance_wxh_sve(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
+
+#define HBD_VARIANCE_WXH_SVE(w, h) \
+ uint32_t vpx_highbd_8_variance##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)sse_long; \
+ sum = (int)sum_long; \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \
+ } \
+ \
+ uint32_t vpx_highbd_10_variance##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t vpx_highbd_12_variance##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+HBD_VARIANCE_WXH_SVE(4, 4)
+HBD_VARIANCE_WXH_SVE(4, 8)
+
+HBD_VARIANCE_WXH_SVE(8, 4)
+HBD_VARIANCE_WXH_SVE(8, 8)
+HBD_VARIANCE_WXH_SVE(8, 16)
+
+HBD_VARIANCE_WXH_SVE(16, 8)
+HBD_VARIANCE_WXH_SVE(16, 16)
+HBD_VARIANCE_WXH_SVE(16, 32)
+
+HBD_VARIANCE_WXH_SVE(32, 16)
+HBD_VARIANCE_WXH_SVE(32, 32)
+HBD_VARIANCE_WXH_SVE(32, 64)
+
+HBD_VARIANCE_WXH_SVE(64, 32)
+HBD_VARIANCE_WXH_SVE(64, 64)
+
+#define HIGHBD_GET_VAR_SVE(s) \
+ void vpx_highbd_8_get##s##x##s##var_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)sse_long; \
+ *sum = (int)sum_long; \
+ } \
+ \
+ void vpx_highbd_10_get##s##x##s##var_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \
+ } \
+ \
+ void vpx_highbd_12_get##s##x##s##var_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \
+ }
+
+HIGHBD_GET_VAR_SVE(8)
+HIGHBD_GET_VAR_SVE(16)
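All three bit depths share the same variance identity; only the normalization shifts differ. A worked example of what the macros above compute, with illustrative numbers:

/* variance = sse - sum^2 / (w * h), after normalizing to 8-bit range:
 *   10-bit: sse >> 4, sum >> 2 (rounded); 12-bit: sse >> 8, sum >> 4.
 * For a 16x16 10-bit block with sse_long = 40960 and sum_long = 1024:
 *   sse = ROUND_POWER_OF_TWO(40960, 4) = 2560
 *   sum = ROUND_POWER_OF_TWO(1024, 2)  = 256
 *   variance = 2560 - (256 * 256) / (16 * 16) = 2560 - 256 = 2304 */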
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
index 47684473ca..b5a944d299 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
@@ -14,86 +14,51 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
-static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p,
- int16x4_t *const s0, int16x4_t *const s1,
- int16x4_t *const s2, int16x4_t *const s3) {
- *s0 = vld1_s16(s);
- s += p;
- *s1 = vld1_s16(s);
- s += p;
- *s2 = vld1_s16(s);
- s += p;
- *s3 = vld1_s16(s);
-}
-
-static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p,
- uint16x8_t *const s0, uint16x8_t *const s1,
- uint16x8_t *const s2, uint16x8_t *const s3) {
- *s0 = vld1q_u16(s);
- s += p;
- *s1 = vld1q_u16(s);
- s += p;
- *s2 = vld1q_u16(s);
- s += p;
- *s3 = vld1q_u16(s);
-}
-
-static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p,
- int16x8_t *const s0, int16x8_t *const s1,
- int16x8_t *const s2, int16x8_t *const s3,
- int16x8_t *const s4, int16x8_t *const s5,
- int16x8_t *const s6, int16x8_t *const s7) {
- *s0 = vld1q_s16(s);
- s += p;
- *s1 = vld1q_s16(s);
- s += p;
- *s2 = vld1q_s16(s);
- s += p;
- *s3 = vld1q_s16(s);
- s += p;
- *s4 = vld1q_s16(s);
- s += p;
- *s5 = vld1q_s16(s);
- s += p;
- *s6 = vld1q_s16(s);
- s += p;
- *s7 = vld1q_s16(s);
+static INLINE uint16x4_t highbd_convolve4_4(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t filters, const uint16x4_t max) {
+ int32x4_t sum = vmull_lane_s16(s0, filters, 0);
+ sum = vmlal_lane_s16(sum, s1, filters, 1);
+ sum = vmlal_lane_s16(sum, s2, filters, 2);
+ sum = vmlal_lane_s16(sum, s3, filters, 3);
+
+ uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
+ return vmin_u16(res, max);
}
-static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p,
- const uint16x8_t s0, const uint16x8_t s1,
- const uint16x8_t s2, const uint16x8_t s3,
- const uint16x8_t s4, const uint16x8_t s5,
- const uint16x8_t s6, const uint16x8_t s7) {
- vst1q_u16(s, s0);
- s += p;
- vst1q_u16(s, s1);
- s += p;
- vst1q_u16(s, s2);
- s += p;
- vst1q_u16(s, s3);
- s += p;
- vst1q_u16(s, s4);
- s += p;
- vst1q_u16(s, s5);
- s += p;
- vst1q_u16(s, s6);
- s += p;
- vst1q_u16(s, s7);
+static INLINE uint16x8_t highbd_convolve4_8(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x4_t filters, const uint16x8_t max) {
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters, 0);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters, 1);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters, 2);
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters, 3);
+
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters, 0);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters, 1);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters, 2);
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters, 3);
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+ return vminq_u16(res, max);
}
-static INLINE int32x4_t highbd_convolve8_4(
- const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
- const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
- const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) {
+static INLINE uint16x4_t
+highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t filters, const uint16x4_t max) {
const int16x4_t filters_lo = vget_low_s16(filters);
const int16x4_t filters_hi = vget_high_s16(filters);
- int32x4_t sum;
- sum = vmull_lane_s16(s0, filters_lo, 0);
+ int32x4_t sum = vmull_lane_s16(s0, filters_lo, 0);
sum = vmlal_lane_s16(sum, s1, filters_lo, 1);
sum = vmlal_lane_s16(sum, s2, filters_lo, 2);
sum = vmlal_lane_s16(sum, s3, filters_lo, 3);
@@ -101,7 +66,9 @@ static INLINE int32x4_t highbd_convolve8_4(
sum = vmlal_lane_s16(sum, s5, filters_hi, 1);
sum = vmlal_lane_s16(sum, s6, filters_hi, 2);
sum = vmlal_lane_s16(sum, s7, filters_hi, 3);
- return sum;
+
+ uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
+ return vmin_u16(res, max);
}
static INLINE uint16x8_t
@@ -111,10 +78,8 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
const int16x8_t filters, const uint16x8_t max) {
const int16x4_t filters_lo = vget_low_s16(filters);
const int16x4_t filters_hi = vget_high_s16(filters);
- int32x4_t sum0, sum1;
- uint16x8_t d;
- sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
+ int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3);
@@ -122,7 +87,8 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2);
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3);
- sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
+
+ int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3);
@@ -130,9 +96,152 @@ highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2);
sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3);
- d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7));
- d = vminq_u16(d, max);
- return d;
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+ return vminq_u16(res, max);
+}
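The convolve helpers above all finish with the same epilogue: vqrshrun_n_s32 rounds to nearest, shifts right by FILTER_BITS (7, from vpx_dsp/vpx_filter.h), and saturates negatives to zero, then vmin_u16/vminq_u16 clamps to the bit-depth ceiling. A scalar model of that step, assuming the filter taps sum to 1 << FILTER_BITS:

    #include <stdint.h>

    /* One output sample from a 32-bit convolution accumulator. */
    static uint16_t convolve_round_clamp(int32_t sum, int bd) {
      int32_t v = (sum + (1 << 6)) >> 7; /* round to nearest, FILTER_BITS == 7 */
      if (v < 0) v = 0;                  /* vqrshrun saturates negatives to 0 */
      const int32_t pixel_max = (1 << bd) - 1;
      return (uint16_t)(v < pixel_max ? v : pixel_max); /* vmin_u16 clamp */
    }
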
+
+static INLINE void highbd_convolve_4tap_horiz_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x4_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 =
+ highbd_convolve4_4(s0[0], s0[1], s0[2], s0[3], filter, max);
+ uint16x4_t d1 =
+ highbd_convolve4_4(s1[0], s1[1], s1[2], s1[3], filter, max);
+ uint16x4_t d2 =
+ highbd_convolve4_4(s2[0], s2[1], s2[2], s2[3], filter, max);
+ uint16x4_t d3 =
+ highbd_convolve4_4(s3[0], s3[1], s3[2], s3[3], filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x8_t d0 =
+ highbd_convolve4_8(s0[0], s0[1], s0[2], s0[3], filter, max);
+ uint16x8_t d1 =
+ highbd_convolve4_8(s1[0], s1[1], s1[2], s1[3], filter, max);
+ uint16x8_t d2 =
+ highbd_convolve4_8(s2[0], s2[1], s2[2], s2[3], filter, max);
+ uint16x8_t d3 =
+ highbd_convolve4_8(s3[0], s3[1], s3[2], s3[3], filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
+
+static INLINE void highbd_convolve_8tap_horiz_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x4_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], s0[6], s0[7], filter, max);
+ uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], s1[6], s1[7], filter, max);
+ uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], s2[6], s2[7], filter, max);
+ uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], s3[6], s3[7], filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], s0[6], s0[7], filter, max);
+ uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], s1[6], s1[7], filter, max);
+ uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], s2[6], s2[7], filter, max);
+ uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], s3[6], s3[7], filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
}
void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
@@ -143,202 +252,25 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
if (x_step_q4 != 16) {
vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
- } else {
- const int16x8_t filters = vld1q_s16(filter[x0_q4]);
- const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
- uint16x8_t t0, t1, t2, t3;
-
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
-
- src -= 3;
-
- if (h == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int32x4_t d0, d1, d2, d3;
- uint16x8_t d01, d23;
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u16_8x4(&t0, &t1, &t2, &t3);
- s0 = vreinterpret_s16_u16(vget_low_u16(t0));
- s1 = vreinterpret_s16_u16(vget_low_u16(t1));
- s2 = vreinterpret_s16_u16(vget_low_u16(t2));
- s3 = vreinterpret_s16_u16(vget_low_u16(t3));
- s4 = vreinterpret_s16_u16(vget_high_u16(t0));
- s5 = vreinterpret_s16_u16(vget_high_u16(t1));
- s6 = vreinterpret_s16_u16(vget_high_u16(t2));
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- src += 7;
-
- do {
- load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
- transpose_s16_4x4d(&s7, &s8, &s9, &s10);
-
- d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ return;
+ }
- d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
- d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
- d01 = vminq_u16(d01, max);
- d23 = vminq_u16(d23, max);
- transpose_u16_4x4q(&d01, &d23);
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ assert(x_step_q4 == 16);
- vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
- vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
- vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- src += 4;
- dst += 4;
- w -= 4;
- } while (w > 0);
- } else {
- int16x8_t t4, t5, t6, t7;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x8_t d0, d1, d2, d3;
-
- if (w == 4) {
- do {
- load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
- &s5, &s6, &s7);
- transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
- load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
- &t4, &t5, &t6, &t7);
- src += 8 * src_stride;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
- transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- d3 =
- highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-
- transpose_u16_8x4(&d0, &d1, &d2, &d3);
- vst1_u16(dst, vget_low_u16(d0));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d1));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d2));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d3));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d0));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d1));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d2));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d3));
- dst += dst_stride;
- h -= 8;
- } while (h > 0);
- } else {
- int width;
- const uint16_t *s;
- uint16_t *d;
- int16x8_t s11, s12, s13, s14;
- uint16x8_t d4, d5, d6, d7;
-
- do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
- &s5, &s6, &s7);
- transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
- width = w;
- s = src + 7;
- d = dst;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
-
- do {
- load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
- &s12, &s13, &s14);
- transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
-
- d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
- max);
- d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
- max);
- d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
- max);
- d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
- max);
- d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
- max);
- d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
- max);
- d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
- max);
- d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
- filters, max);
-
- transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
- store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src += 8 * src_stride;
- dst += 8 * dst_stride;
- h -= 8;
- } while (h > 0);
- }
- }
+ if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+ const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2);
+ highbd_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
+ x_filter_4tap, bd);
+ } else {
+ const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
+ highbd_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
+ x_filter_8tap, bd);
}
}
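The dispatch above exploits the fact that libvpx stores every interpolation kernel as eight taps: when vpx_get_filter_taps reports four or fewer, the non-zero coefficients occupy positions 2..5, so the 4-tap path loads filter[x0_q4] + 2 and rewinds src by 1 sample instead of 3. A scalar sketch showing why the two anchorings agree (illustrative, not libvpx API):

    #include <stdint.h>

    static int32_t apply_taps(const uint16_t *s, const int16_t *k, int taps) {
      int32_t sum = 0;
      for (int i = 0; i < taps; ++i) sum += (int32_t)k[i] * s[i];
      return sum;
    }

    /* For a kernel8 with k[0] == k[1] == k[6] == k[7] == 0, both branches
     * below produce identical sums for the sample at src. */
    static int32_t convolve_sample(const uint16_t *src, const int16_t *kernel8,
                                   int effective_taps) {
      return effective_taps <= 4 ? apply_taps(src - 1, kernel8 + 2, 4)
                                 : apply_taps(src - 3, kernel8, 8);
    }
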
@@ -352,66 +284,233 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
bd);
+ return;
+ }
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x4_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], s0[6], s0[7], filters, max);
+ uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], s1[6], s1[7], filters, max);
+ uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], s2[6], s2[7], filters, max);
+ uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], s3[6], s3[7], filters, max);
+
+ d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+ d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+ d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+ d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+ s0[5], s0[6], s0[7], filters, max);
+ uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+ s1[5], s1[6], s1[7], filters, max);
+ uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+ s2[5], s2[6], s2[7], filters, max);
+ uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+ s3[5], s3[6], s3[7], filters, max);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
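The vrhadd_u16 / vrhaddq_u16 calls above implement the averaging half of the predictor: each convolved sample is combined with the value already in dst using a rounding average. The scalar equivalent per lane:

    #include <stdint.h>

    /* vrhadd_u16(a, b): rounding halving add. */
    static uint16_t avg_round(uint16_t convolved, uint16_t dst_val) {
      return (uint16_t)(((uint32_t)convolved + dst_val + 1) >> 1);
    }
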
+
+static INLINE void highbd_convolve_4tap_vert_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t s0, s1, s2;
+ load_s16_4x3(s, src_stride, &s0, &s1, &s2);
+
+ s += 3 * src_stride;
+
+ do {
+ int16x4_t s3, s4, s5, s6;
+ load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
+
+ uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, filter, max);
+ uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, filter, max);
+ uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, filter, max);
+ uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
} else {
- const int16x8_t filters = vld1q_s16(filter[x0_q4]);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
-
- src -= 3;
-
- if (h == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int32x4_t d0, d1, d2, d3;
- uint16x8_t t0, t1, t2, t3;
- uint16x8_t d01, d23, t01, t23;
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u16_8x4(&t0, &t1, &t2, &t3);
- s0 = vreinterpret_s16_u16(vget_low_u16(t0));
- s1 = vreinterpret_s16_u16(vget_low_u16(t1));
- s2 = vreinterpret_s16_u16(vget_low_u16(t2));
- s3 = vreinterpret_s16_u16(vget_low_u16(t3));
- s4 = vreinterpret_s16_u16(vget_high_u16(t0));
- s5 = vreinterpret_s16_u16(vget_high_u16(t1));
- s6 = vreinterpret_s16_u16(vget_high_u16(t2));
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- src += 7;
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t s0, s1, s2;
+ load_s16_8x3(s, src_stride, &s0, &s1, &s2);
+
+ s += 3 * src_stride;
do {
- load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
- transpose_s16_4x4d(&s7, &s8, &s9, &s10);
-
- d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-
- t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
- t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
- t01 = vminq_u16(t01, max);
- t23 = vminq_u16(t23, max);
- transpose_u16_4x4q(&t01, &t23);
-
- d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
- vld1_u16(dst + 2 * dst_stride));
- d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
- vld1_u16(dst + 3 * dst_stride));
- d01 = vrhaddq_u16(d01, t01);
- d23 = vrhaddq_u16(d23, t23);
-
- vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
- vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
- vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
- vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
+ int16x8_t s3, s4, s5, s6;
+ load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
+
+ uint16x8_t d0 = highbd_convolve4_8(s0, s1, s2, s3, filter, max);
+ uint16x8_t d1 = highbd_convolve4_8(s1, s2, s3, s4, filter, max);
+ uint16x8_t d2 = highbd_convolve4_8(s2, s3, s4, s5, filter, max);
+ uint16x8_t d3 = highbd_convolve4_8(s3, s4, s5, s6, filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+static INLINE void highbd_convolve_8tap_vert_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 =
+ highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter, max);
+ uint16x4_t d1 =
+ highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter, max);
+ uint16x4_t d2 =
+ highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter, max);
+ uint16x4_t d3 =
+ highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 =
+ highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter, max);
+ uint16x8_t d1 =
+ highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter, max);
+ uint16x8_t d2 =
+ highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter, max);
+ uint16x8_t d3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -420,164 +519,14 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
s4 = s8;
s5 = s9;
s6 = s10;
- src += 4;
- dst += 4;
- w -= 4;
- } while (w > 0);
- } else {
- int16x8_t t4, t5, t6, t7;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
-
- if (w == 4) {
- do {
- load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
- &s5, &s6, &s7);
- transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
- load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
- &t4, &t5, &t6, &t7);
- src += 8 * src_stride;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
- transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- t3 =
- highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
- transpose_u16_8x4(&t0, &t1, &t2, &t3);
-
- d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
- vld1_u16(dst + 4 * dst_stride));
- d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
- vld1_u16(dst + 5 * dst_stride));
- d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
- vld1_u16(dst + 6 * dst_stride));
- d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride),
- vld1_u16(dst + 7 * dst_stride));
- d0 = vrhaddq_u16(d0, t0);
- d1 = vrhaddq_u16(d1, t1);
- d2 = vrhaddq_u16(d2, t2);
- d3 = vrhaddq_u16(d3, t3);
-
- vst1_u16(dst, vget_low_u16(d0));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d1));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d2));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d3));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d0));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d1));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d2));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d3));
- dst += dst_stride;
- h -= 8;
- } while (h > 0);
- } else {
- int width;
- const uint16_t *s;
- uint16_t *d;
- int16x8_t s11, s12, s13, s14;
- uint16x8_t d4, d5, d6, d7;
-
- do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
- &s5, &s6, &s7);
- transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
- width = w;
- s = src + 7;
- d = dst;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
-
- do {
- load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
- &s12, &s13, &s14);
- transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
-
- d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters,
- max);
- d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters,
- max);
- d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters,
- max);
- d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters,
- max);
- d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters,
- max);
- d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters,
- max);
- d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters,
- max);
- d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14,
- filters, max);
-
- transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-
- d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
- d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
- d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
- d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
- d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride));
- d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride));
- d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride));
- d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride));
-
- store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src += 8 * src_stride;
- dst += 8 * dst_stride;
- h -= 8;
- } while (h > 0);
- }
- }
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
}
}
@@ -589,160 +538,25 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
if (y_step_q4 != 16) {
vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
x_step_q4, y0_q4, y_step_q4, w, h, bd);
- } else {
- const int16x8_t filters = vld1q_s16(filter[y0_q4]);
- const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
-
- src -= 3 * src_stride;
-
- if (w == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int32x4_t d0, d1, d2, d3;
- uint16x8_t d01, d23;
-
- s0 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s1 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s2 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s3 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s4 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s5 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s6 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
+ return;
+ }
- do {
- s7 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s8 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s9 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s10 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-
- d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
- d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
- d01 = vminq_u16(d01, max);
- d23 = vminq_u16(d23, max);
- vst1_u16(dst, vget_low_u16(d01));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d01));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d23));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d23));
- dst += dst_stride;
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ assert(y_step_q4 == 16);
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- h -= 4;
- } while (h > 0);
- } else {
- int height;
- const uint16_t *s;
- uint16_t *d;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x8_t d0, d1, d2, d3;
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
- do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- s = src;
- s0 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s1 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s2 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s3 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s4 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s5 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s6 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- d = dst;
- height = h;
-
- do {
- s7 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s8 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s9 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s10 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
- d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- d3 =
- highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-
- vst1q_u16(d, d0);
- d += dst_stride;
- vst1q_u16(d, d1);
- d += dst_stride;
- vst1q_u16(d, d2);
- d += dst_stride;
- vst1q_u16(d, d3);
- d += dst_stride;
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- height -= 4;
- } while (height > 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w > 0);
- }
+ if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+ const int16x4_t y_filter_4tap = vld1_s16(filter[y0_q4] + 2);
+ highbd_convolve_4tap_vert_neon(src - src_stride, src_stride, dst,
+ dst_stride, w, h, y_filter_4tap, bd);
+ } else {
+ const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
+ highbd_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst,
+ dst_stride, w, h, y_filter_8tap, bd);
}
}
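The vertical kernels above keep seven input rows live across iterations and load only four new rows per pass, so four output rows cost four row loads plus the multiply-accumulate chains. The same sliding-window shape for a single column, in scalar form (bit-depth clamp omitted, h assumed a multiple of 4 as in the NEON code; purely illustrative):

    #include <stddef.h>
    #include <stdint.h>

    static void vert_8tap_column(const int16_t *s, ptrdiff_t stride,
                                 int16_t *d, ptrdiff_t d_stride, int h,
                                 const int16_t *k /* 8 taps */) {
      int16_t r[11];
      for (int i = 0; i < 7; ++i) r[i] = s[i * stride];
      s += 7 * stride;
      do {
        for (int i = 0; i < 4; ++i) r[7 + i] = s[i * stride]; /* 4 new rows */
        for (int row = 0; row < 4; ++row) {
          int32_t sum = 0;
          for (int t = 0; t < 8; ++t) sum += (int32_t)k[t] * r[row + t];
          d[row * d_stride] = (int16_t)((sum + 64) >> 7); /* FILTER_BITS == 7 */
        }
        for (int i = 0; i < 7; ++i) r[i] = r[i + 4]; /* slide the window down */
        s += 4 * stride;
        d += 4 * d_stride;
        h -= 4;
      } while (h != 0);
    }
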
@@ -756,78 +570,89 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
bd);
+ return;
+ }
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 =
+ highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ uint16x4_t d1 =
+ highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ uint16x4_t d2 =
+ highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ uint16x4_t d3 =
+ highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+ d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+ d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+ d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+ d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
} else {
- const int16x8_t filters = vld1q_s16(filter[y0_q4]);
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
- assert(!((intptr_t)dst & 3));
- assert(!(dst_stride & 3));
-
- src -= 3 * src_stride;
-
- if (w == 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int32x4_t d0, d1, d2, d3;
- uint16x8_t d01, d23, t01, t23;
-
- s0 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s1 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s2 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s3 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s4 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s5 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s6 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+ s += 7 * src_stride;
do {
- s7 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s8 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s9 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- s10 = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-
- t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
- t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
- t01 = vminq_u16(t01, max);
- t23 = vminq_u16(t23, max);
-
- d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
- vld1_u16(dst + 1 * dst_stride));
- d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
- vld1_u16(dst + 3 * dst_stride));
- d01 = vrhaddq_u16(d01, t01);
- d23 = vrhaddq_u16(d23, t23);
-
- vst1_u16(dst, vget_low_u16(d01));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d01));
- dst += dst_stride;
- vst1_u16(dst, vget_low_u16(d23));
- dst += dst_stride;
- vst1_u16(dst, vget_high_u16(d23));
- dst += dst_stride;
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 =
+ highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+ uint16x8_t d1 =
+ highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+ uint16x8_t d2 =
+ highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+ uint16x8_t d3 =
+ highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
s0 = s4;
s1 = s5;
@@ -836,96 +661,592 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
s4 = s8;
s5 = s9;
s6 = s10;
- h -= 4;
- } while (h > 0);
- } else {
- int height;
- const uint16_t *s;
- uint16_t *d;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
- do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- s = src;
- s0 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s1 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s2 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s3 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s4 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s5 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s6 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- d = dst;
- height = h;
-
- do {
- s7 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s8 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s9 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
- s10 = vreinterpretq_s16_u16(vld1q_u16(s));
- s += src_stride;
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
- t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
- t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
- t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
- t3 =
- highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-
- d0 = vld1q_u16(d + 0 * dst_stride);
- d1 = vld1q_u16(d + 1 * dst_stride);
- d2 = vld1q_u16(d + 2 * dst_stride);
- d3 = vld1q_u16(d + 3 * dst_stride);
- d0 = vrhaddq_u16(d0, t0);
- d1 = vrhaddq_u16(d1, t1);
- d2 = vrhaddq_u16(d2, t2);
- d3 = vrhaddq_u16(d3, t3);
-
- vst1q_u16(d, d0);
- d += dst_stride;
- vst1q_u16(d, d1);
- d += dst_stride;
- vst1q_u16(d, d2);
- d += dst_stride;
- vst1q_u16(d, d3);
- d += dst_stride;
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- height -= 4;
- } while (height > 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w > 0);
- }
+static INLINE void highbd_convolve_2d_4tap_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
+ const int16x4_t y_filter, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t h_s0[4], h_s1[4], h_s2[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]);
+
+ int16x4_t v_s0 = vreinterpret_s16_u16(
+ highbd_convolve4_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max));
+ int16x4_t v_s1 = vreinterpret_s16_u16(
+ highbd_convolve4_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max));
+ int16x4_t v_s2 = vreinterpret_s16_u16(
+ highbd_convolve4_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max));
+
+ s += 3 * src_stride;
+
+ do {
+ int16x4_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
+ &h_s3[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
+ &h_s4[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
+ &h_s5[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
+ &h_s6[3]);
+
+ int16x4_t v_s3 = vreinterpret_s16_u16(highbd_convolve4_4(
+ h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max));
+ int16x4_t v_s4 = vreinterpret_s16_u16(highbd_convolve4_4(
+ h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max));
+ int16x4_t v_s5 = vreinterpret_s16_u16(highbd_convolve4_4(
+ h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max));
+ int16x4_t v_s6 = vreinterpret_s16_u16(highbd_convolve4_4(
+ h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max));
+
+ uint16x4_t d0 = highbd_convolve4_4(v_s0, v_s1, v_s2, v_s3, y_filter, max);
+ uint16x4_t d1 = highbd_convolve4_4(v_s1, v_s2, v_s3, v_s4, y_filter, max);
+ uint16x4_t d2 = highbd_convolve4_4(v_s2, v_s3, v_s4, v_s5, y_filter, max);
+ uint16x4_t d3 = highbd_convolve4_4(v_s3, v_s4, v_s5, v_s6, y_filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+
+ return;
+ }
+
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t h_s0[4], h_s1[4], h_s2[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]);
+
+ int16x8_t v_s0 = vreinterpretq_s16_u16(
+ highbd_convolve4_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max));
+ int16x8_t v_s1 = vreinterpretq_s16_u16(
+ highbd_convolve4_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max));
+ int16x8_t v_s2 = vreinterpretq_s16_u16(
+ highbd_convolve4_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max));
+
+ s += 3 * src_stride;
+
+ do {
+ int16x8_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
+ &h_s3[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
+ &h_s4[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
+ &h_s5[3]);
+ load_s16_8x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
+ &h_s6[3]);
+
+ int16x8_t v_s3 = vreinterpretq_s16_u16(highbd_convolve4_8(
+ h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max));
+ int16x8_t v_s4 = vreinterpretq_s16_u16(highbd_convolve4_8(
+ h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max));
+ int16x8_t v_s5 = vreinterpretq_s16_u16(highbd_convolve4_8(
+ h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max));
+ int16x8_t v_s6 = vreinterpretq_s16_u16(highbd_convolve4_8(
+ h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max));
+
+ uint16x8_t d0 = highbd_convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter, max);
+ uint16x8_t d1 = highbd_convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter, max);
+ uint16x8_t d2 = highbd_convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter, max);
+ uint16x8_t d3 = highbd_convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+}
+
+static INLINE void highbd_convolve_2d_8tap_neon(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x8_t x_filter,
+ const int16x8_t y_filter, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+ &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+ &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+ &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+ &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+ load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+ &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+ load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+ &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+ load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+ &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+ int16x4_t v_s0 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+ h_s0[6], h_s0[7], x_filter, max));
+ int16x4_t v_s1 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+ h_s1[6], h_s1[7], x_filter, max));
+ int16x4_t v_s2 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+ h_s2[6], h_s2[7], x_filter, max));
+ int16x4_t v_s3 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+ h_s3[6], h_s3[7], x_filter, max));
+ int16x4_t v_s4 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+ h_s4[6], h_s4[7], x_filter, max));
+ int16x4_t v_s5 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+ h_s5[6], h_s5[7], x_filter, max));
+ int16x4_t v_s6 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+ h_s6[6], h_s6[7], x_filter, max));
+
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+ &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+ &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+ &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+ &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+ int16x4_t v_s7 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+ h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+ int16x4_t v_s8 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+ h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+ int16x4_t v_s9 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+ h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+ int16x4_t v_s10 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+ h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+ uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+ v_s6, v_s7, y_filter, max);
+ uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+ v_s7, v_s8, y_filter, max);
+ uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+ v_s8, v_s9, y_filter, max);
+ uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+ v_s9, v_s10, y_filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ v_s3 = v_s7;
+ v_s4 = v_s8;
+ v_s5 = v_s9;
+ v_s6 = v_s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+
+ return;
+ }
+
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+ &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+ &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+ &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+ &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+ load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+ &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+ load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+ &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+ load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+ &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+ int16x8_t v_s0 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+ h_s0[6], h_s0[7], x_filter, max));
+ int16x8_t v_s1 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+ h_s1[6], h_s1[7], x_filter, max));
+ int16x8_t v_s2 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+ h_s2[6], h_s2[7], x_filter, max));
+ int16x8_t v_s3 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+ h_s3[6], h_s3[7], x_filter, max));
+ int16x8_t v_s4 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+ h_s4[6], h_s4[7], x_filter, max));
+ int16x8_t v_s5 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+ h_s5[6], h_s5[7], x_filter, max));
+ int16x8_t v_s6 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+ h_s6[6], h_s6[7], x_filter, max));
+
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+ &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+ &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+ &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+ &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+ int16x8_t v_s7 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+ h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+ int16x8_t v_s8 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+ h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+ int16x8_t v_s9 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+ h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+ int16x8_t v_s10 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+ h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+ uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+ v_s6, v_s7, y_filter, max);
+ uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+ v_s7, v_s8, y_filter, max);
+ uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+ v_s8, v_s9, y_filter, max);
+ uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+ v_s9, v_s10, y_filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ v_s3 = v_s7;
+ v_s4 = v_s8;
+ v_s5 = v_s9;
+ v_s6 = v_s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+}
+
+void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h, int bd) {
+ if (x_step_q4 != 16 || y_step_q4 != 16) {
+ vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h, bd);
+ return;
+ }
+
+ const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
+ const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+ // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
+ // lines post both horizontally and vertically.
+ const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
+ const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;
+
+ if (x_filter_taps == 4 && y_filter_taps == 4) {
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+ const int16x4_t y_filter = vld1_s16(filter[y0_q4] + 2);
+
+ highbd_convolve_2d_4tap_neon(src - horiz_offset - vert_offset, src_stride,
+ dst, dst_stride, w, h, x_filter, y_filter, bd);
+ return;
+ }
+
+ const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ highbd_convolve_2d_8tap_neon(src - horiz_offset - vert_offset, src_stride,
+ dst, dst_stride, w, h, x_filter, y_filter, bd);
+}
+
+void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (x_step_q4 != 16 || y_step_q4 != 16) {
+ vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h, bd);
+ return;
}
+
+ // Averaging convolution always uses an 8-tap filter.
+ const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1;
+ const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride;
+ // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2
+ // lines post both horizontally and vertically.
+ src = src - horiz_offset - vert_offset;
+
+ const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+ &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+ &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+ &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+ &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+ load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+ &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+ load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+ &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+ load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+ &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+ int16x4_t v_s0 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+ h_s0[6], h_s0[7], x_filter, max));
+ int16x4_t v_s1 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+ h_s1[6], h_s1[7], x_filter, max));
+ int16x4_t v_s2 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+ h_s2[6], h_s2[7], x_filter, max));
+ int16x4_t v_s3 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+ h_s3[6], h_s3[7], x_filter, max));
+ int16x4_t v_s4 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+ h_s4[6], h_s4[7], x_filter, max));
+ int16x4_t v_s5 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+ h_s5[6], h_s5[7], x_filter, max));
+ int16x4_t v_s6 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+ h_s6[6], h_s6[7], x_filter, max));
+
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+ load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+ &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+ load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+ &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+ load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+ &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+ load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+ &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+ int16x4_t v_s7 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+ h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+ int16x4_t v_s8 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+ h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+ int16x4_t v_s9 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+ h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+ int16x4_t v_s10 = vreinterpret_s16_u16(
+ highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+ h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+ uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+ v_s6, v_s7, y_filter, max);
+ uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+ v_s7, v_s8, y_filter, max);
+ uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+ v_s8, v_s9, y_filter, max);
+ uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+ v_s9, v_s10, y_filter, max);
+
+ d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+ d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+ d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+ d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ v_s3 = v_s7;
+ v_s4 = v_s8;
+ v_s5 = v_s9;
+ v_s6 = v_s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+
+ return;
+ }
+
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+ &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+ &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+ &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+ &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+ load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+ &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+ load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+ &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+ load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+ &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+ int16x8_t v_s0 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+ h_s0[6], h_s0[7], x_filter, max));
+ int16x8_t v_s1 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+ h_s1[6], h_s1[7], x_filter, max));
+ int16x8_t v_s2 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+ h_s2[6], h_s2[7], x_filter, max));
+ int16x8_t v_s3 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+ h_s3[6], h_s3[7], x_filter, max));
+ int16x8_t v_s4 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+ h_s4[6], h_s4[7], x_filter, max));
+ int16x8_t v_s5 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+ h_s5[6], h_s5[7], x_filter, max));
+ int16x8_t v_s6 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+ h_s6[6], h_s6[7], x_filter, max));
+
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+ &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+ &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+ &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+ &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+ int16x8_t v_s7 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+ h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+ int16x8_t v_s8 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+ h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+ int16x8_t v_s9 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+ h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+ int16x8_t v_s10 = vreinterpretq_s16_u16(
+ highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+ h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+ uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+ v_s6, v_s7, y_filter, max);
+ uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+ v_s7, v_s8, y_filter, max);
+ uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+ v_s8, v_s9, y_filter, max);
+ uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+ v_s9, v_s10, y_filter, max);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ v_s3 = v_s7;
+ v_s4 = v_s8;
+ v_s5 = v_s9;
+ v_s6 = v_s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
}
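
The kernel above fuses both passes of the separable filter: seven rows are
filtered horizontally up front, then each iteration filters four more rows,
convolves the column window vertically, and rolls v_s0..v_s6 forward. A
scalar sketch of the arithmetic it implements, assuming FILTER_BITS == 7 and
clamping to [0, (1 << bd) - 1]; the helper names are illustrative, not
library API:

#include <stdint.h>

static uint16_t clamp_px(int v, int bd) {
  const int max = (1 << bd) - 1;
  return (uint16_t)(v < 0 ? 0 : (v > max ? max : v));
}

/* src points at the first output pixel; the 8 taps reach 3 samples back and
   4 forward in each direction, matching the - horiz_offset - vert_offset
   adjustment applied by the wrapper. */
static void highbd_convolve8_2d_ref(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride,
                                    const int16_t *x_filter, /* 8 taps */
                                    const int16_t *y_filter, /* 8 taps */
                                    int w, int h, int bd) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int tmp[8]; /* one horizontally filtered sample per source row */
      for (int k = 0; k < 8; ++k) {
        const uint16_t *row = src + (y + k - 3) * src_stride;
        int sum = 0;
        for (int t = 0; t < 8; ++t) sum += x_filter[t] * row[x + t - 3];
        tmp[k] = clamp_px((sum + 64) >> 7, bd); /* round, shift, clamp */
      }
      int sum = 0;
      for (int t = 0; t < 8; ++t) sum += y_filter[t] * tmp[t];
      dst[y * dst_stride + x] = clamp_px((sum + 64) >> 7, bd);
    }
  }
}
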
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c
new file mode 100644
index 0000000000..7fc0a57c90
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+
+DECLARE_ALIGNED(16, static const uint16_t, kTblConv4_8[8]) = { 0, 2, 4, 6,
+ 1, 3, 5, 7 };
+
+static INLINE uint16x4_t highbd_convolve4_4(const int16x4_t s[4],
+ const int16x8_t filter,
+ const uint16x4_t max) {
+ int16x8_t s01 = vcombine_s16(s[0], s[1]);
+ int16x8_t s23 = vcombine_s16(s[2], s[3]);
+
+ int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s01, filter, 0);
+ int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s23, filter, 0);
+
+ int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+
+ uint16x4_t res_u16 = vqrshrun_n_s32(res_s32, FILTER_BITS);
+ return vmin_u16(res_u16, max);
+}
+
+static INLINE uint16x8_t highbd_convolve4_8(const int16x8_t s[4],
+ const int16x8_t filter,
+ const uint16x8_t max,
+ uint16x8_t idx) {
+ int64x2_t sum04 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[0], filter, 0);
+ int64x2_t sum15 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[1], filter, 0);
+ int64x2_t sum26 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[2], filter, 0);
+ int64x2_t sum37 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[3], filter, 0);
+
+ int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15));
+ int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37));
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS),
+ vqrshrun_n_s32(res1, FILTER_BITS));
+
+ res = vpx_tbl_u16(res, idx);
+
+ return vminq_u16(res, max);
+}
+
+static INLINE uint16x4_t highbd_convolve8_4(const int16x8_t s[4],
+ const int16x8_t filter,
+ const uint16x4_t max) {
+ int64x2_t sum[4];
+
+ sum[0] = vpx_dotq_s16(vdupq_n_s64(0), s[0], filter);
+ sum[1] = vpx_dotq_s16(vdupq_n_s64(0), s[1], filter);
+ sum[2] = vpx_dotq_s16(vdupq_n_s64(0), s[2], filter);
+ sum[3] = vpx_dotq_s16(vdupq_n_s64(0), s[3], filter);
+
+ sum[0] = vpaddq_s64(sum[0], sum[1]);
+ sum[2] = vpaddq_s64(sum[2], sum[3]);
+
+ int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2]));
+
+ uint16x4_t res_u16 = vqrshrun_n_s32(res_s32, FILTER_BITS);
+ return vmin_u16(res_u16, max);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8(const int16x8_t s[8],
+ const int16x8_t filter,
+ const uint16x8_t max) {
+ int64x2_t sum[8];
+
+ sum[0] = vpx_dotq_s16(vdupq_n_s64(0), s[0], filter);
+ sum[1] = vpx_dotq_s16(vdupq_n_s64(0), s[1], filter);
+ sum[2] = vpx_dotq_s16(vdupq_n_s64(0), s[2], filter);
+ sum[3] = vpx_dotq_s16(vdupq_n_s64(0), s[3], filter);
+ sum[4] = vpx_dotq_s16(vdupq_n_s64(0), s[4], filter);
+ sum[5] = vpx_dotq_s16(vdupq_n_s64(0), s[5], filter);
+ sum[6] = vpx_dotq_s16(vdupq_n_s64(0), s[6], filter);
+ sum[7] = vpx_dotq_s16(vdupq_n_s64(0), s[7], filter);
+
+ int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]);
+ int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]);
+ int64x2_t sum45 = vpaddq_s64(sum[4], sum[5]);
+ int64x2_t sum67 = vpaddq_s64(sum[6], sum[7]);
+
+ int32x4_t res0 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+ int32x4_t res1 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS),
+ vqrshrun_n_s32(res1, FILTER_BITS));
+ return vminq_u16(res, max);
+}
+
+static INLINE void highbd_convolve_4tap_horiz_sve(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x4_t filters, int bd) {
+ const int16x8_t filter = vcombine_s16(filters, vdup_n_s16(0));
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x4_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 = highbd_convolve4_4(s0, filter, max);
+ uint16x4_t d1 = highbd_convolve4_4(s1, filter, max);
+ uint16x4_t d2 = highbd_convolve4_4(s2, filter, max);
+ uint16x4_t d3 = highbd_convolve4_4(s3, filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ const uint16x8_t idx = vld1q_u16(kTblConv4_8);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x8_t d0 = highbd_convolve4_8(s0, filter, max, idx);
+ uint16x8_t d1 = highbd_convolve4_8(s1, filter, max, idx);
+ uint16x8_t d2 = highbd_convolve4_8(s2, filter, max, idx);
+ uint16x8_t d3 = highbd_convolve4_8(s3, filter, max, idx);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
+
+static INLINE void highbd_convolve_8tap_horiz_sve(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x8_t filters, int bd) {
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x8_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 = highbd_convolve8_4(s0, filters, max);
+ uint16x4_t d1 = highbd_convolve8_4(s1, filters, max);
+ uint16x4_t d2 = highbd_convolve8_4(s2, filters, max);
+ uint16x4_t d3 = highbd_convolve8_4(s3, filters, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 = highbd_convolve8_8(s0, filters, max);
+ uint16x8_t d1 = highbd_convolve8_8(s1, filters, max);
+ uint16x8_t d2 = highbd_convolve8_8(s2, filters, max);
+ uint16x8_t d3 = highbd_convolve8_8(s3, filters, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
+
+void vpx_highbd_convolve8_horiz_sve(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (x_step_q4 != 16) {
+ vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
+ return;
+ }
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+ const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2);
+ highbd_convolve_4tap_horiz_sve(src - 1, src_stride, dst, dst_stride, w, h,
+ x_filter_4tap, bd);
+ } else {
+ const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
+ highbd_convolve_8tap_horiz_sve(src - 3, src_stride, dst, dst_stride, w, h,
+ x_filter_8tap, bd);
+ }
+}
+
+void vpx_highbd_convolve8_avg_horiz_sve(const uint16_t *src,
+ ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (x_step_q4 != 16) {
+ vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
+ return;
+ }
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ do {
+ int16x8_t s0[4], s1[4], s2[4], s3[4];
+ load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+ load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+ load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+ load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+ uint16x4_t d0 = highbd_convolve8_4(s0, filters, max);
+ uint16x4_t d1 = highbd_convolve8_4(s1, filters, max);
+ uint16x4_t d2 = highbd_convolve8_4(s2, filters, max);
+ uint16x4_t d3 = highbd_convolve8_4(s3, filters, max);
+
+ d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+ d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+ d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+ d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int width = w;
+
+ do {
+ int16x8_t s0[8], s1[8], s2[8], s3[8];
+ load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+ &s0[4], &s0[5], &s0[6], &s0[7]);
+ load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+ &s1[4], &s1[5], &s1[6], &s1[7]);
+ load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+ &s2[4], &s2[5], &s2[6], &s2[7]);
+ load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+ &s3[4], &s3[5], &s3[6], &s3[7]);
+
+ uint16x8_t d0 = highbd_convolve8_8(s0, filters, max);
+ uint16x8_t d1 = highbd_convolve8_8(s1, filters, max);
+ uint16x8_t d2 = highbd_convolve8_8(s2, filters, max);
+ uint16x8_t d3 = highbd_convolve8_8(s3, filters, max);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ }
+}
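
Two details in the SVE horizontal kernels above are easy to miss. First,
vpx_dotq_s16 / vpx_dotq_lane_s16 (from the Neon/SVE bridge header) are
assumed to behave like SVE's SDOT: each 64-bit lane accumulates a dot product
of four adjacent int16 pairs, so two lanes cover an 8-tap filter and a
pairwise add finishes the sum. Second, in highbd_convolve4_8 the per-lane
results come out in the order 0,4,1,5,2,6,3,7, and kTblConv4_8 =
{ 0, 2, 4, 6, 1, 3, 5, 7 } permutes them back into pixel order. A scalar
model of the dot-product step, illustrative only:

#include <stdint.h>

/* One 64-bit lane of an SDOT-style int16 dot product. */
static int64_t dot4_s16(const int16_t *s, const int16_t *f) {
  return (int64_t)s[0] * f[0] + (int64_t)s[1] * f[1] +
         (int64_t)s[2] * f[2] + (int64_t)s[3] * f[3];
}

/* An 8-tap sum is lane 0 plus lane 1, i.e. the vpaddq_s64 above. */
static int64_t dot8_s16(const int16_t *s, const int16_t *f) {
  return dot4_s16(s, f) + dot4_s16(s + 4, f + 4);
}
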
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c
new file mode 100644
index 0000000000..4ed7718f7d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+#include "vpx_dsp/arm/vpx_neon_sve2_bridge.h"
+
+// clang-format off
+DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = {
+ // Shift left and insert new last column in transposed 4x4 block.
+ 1, 2, 3, 0, 5, 6, 7, 4,
+ // Shift left and insert two new columns in transposed 4x4 block.
+ 2, 3, 0, 1, 6, 7, 4, 5,
+ // Shift left and insert three new columns in transposed 4x4 block.
+ 3, 0, 1, 2, 7, 4, 5, 6,
+};
+// clang-format on
+
+static INLINE void transpose_concat_4x4(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ int16x8_t res[2]) {
+ // Transpose 16-bit elements:
+ // s0: 00, 01, 02, 03
+ // s1: 10, 11, 12, 13
+ // s2: 20, 21, 22, 23
+ // s3: 30, 31, 32, 33
+ //
+ // res[0]: 00 10 20 30 01 11 21 31
+ // res[1]: 02 12 22 32 03 13 23 33
+
+ int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0));
+ int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0));
+ int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0));
+ int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0));
+
+ int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q));
+ int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q));
+
+ int32x4x2_t t0123 = vzipq_s32(s01, s23);
+
+ res[0] = vreinterpretq_s16_s32(t0123.val[0]);
+ res[1] = vreinterpretq_s16_s32(t0123.val[1]);
+}
+
+static INLINE void transpose_concat_8x4(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ int16x8_t res[4]) {
+ // Transpose 16-bit elements:
+ // s0: 00, 01, 02, 03, 04, 05, 06, 07
+ // s1: 10, 11, 12, 13, 14, 15, 16, 17
+ // s2: 20, 21, 22, 23, 24, 25, 26, 27
+ // s3: 30, 31, 32, 33, 34, 35, 36, 37
+ //
+ // res[0]: 00 10 20 30 01 11 21 31
+ // res[1]: 02 12 22 32 03 13 23 33
+ // res[2]: 04 14 24 34 05 15 25 35
+ // res[3]: 06 16 26 36 07 17 27 37
+
+ int16x8x2_t s01 = vzipq_s16(s0, s1);
+ int16x8x2_t s23 = vzipq_s16(s2, s3);
+
+ int32x4x2_t t0123_lo = vzipq_s32(vreinterpretq_s32_s16(s01.val[0]),
+ vreinterpretq_s32_s16(s23.val[0]));
+ int32x4x2_t t0123_hi = vzipq_s32(vreinterpretq_s32_s16(s01.val[1]),
+ vreinterpretq_s32_s16(s23.val[1]));
+
+ res[0] = vreinterpretq_s16_s32(t0123_lo.val[0]);
+ res[1] = vreinterpretq_s16_s32(t0123_lo.val[1]);
+ res[2] = vreinterpretq_s16_s32(t0123_hi.val[0]);
+ res[3] = vreinterpretq_s16_s32(t0123_hi.val[1]);
+}
+
+static INLINE void vpx_tbl2x4_s16(int16x8_t s0[4], int16x8_t s1[4],
+ int16x8_t res[4], uint16x8_t idx) {
+ res[0] = vpx_tbl2_s16(s0[0], s1[0], idx);
+ res[1] = vpx_tbl2_s16(s0[1], s1[1], idx);
+ res[2] = vpx_tbl2_s16(s0[2], s1[2], idx);
+ res[3] = vpx_tbl2_s16(s0[3], s1[3], idx);
+}
+
+static INLINE void vpx_tbl2x2_s16(int16x8_t s0[2], int16x8_t s1[2],
+ int16x8_t res[2], uint16x8_t idx) {
+ res[0] = vpx_tbl2_s16(s0[0], s1[0], idx);
+ res[1] = vpx_tbl2_s16(s0[1], s1[1], idx);
+}
+
+static INLINE uint16x4_t highbd_convolve8_4_v(int16x8_t s_lo[2],
+ int16x8_t s_hi[2],
+ int16x8_t filter,
+ uint16x4_t max) {
+ int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0);
+ sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1);
+
+ int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0);
+ sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1);
+
+ int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+
+ uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
+ return vmin_u16(res, max);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_v(const int16x8_t s_lo[4],
+ const int16x8_t s_hi[4],
+ const int16x8_t filter,
+ const uint16x8_t max) {
+ int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0);
+ sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1);
+
+ int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0);
+ sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1);
+
+ int64x2_t sum45 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[2], filter, 0);
+ sum45 = vpx_dotq_lane_s16(sum45, s_hi[2], filter, 1);
+
+ int64x2_t sum67 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[3], filter, 0);
+ sum67 = vpx_dotq_lane_s16(sum67, s_hi[3], filter, 1);
+
+ int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+ int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
+
+ uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
+ vqrshrun_n_s32(sum4567, FILTER_BITS));
+ return vminq_u16(res, max);
+}
+
+static INLINE void highbd_convolve8_8tap_vert_sve2(
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
+ assert(w >= 4 && h >= 4);
+ uint16x8x3_t merge_tbl_idx = vld1q_u16_x3(kDotProdMergeBlockTbl);
+
+ // Adjust the indices to account for the SVE vector length.
+ merge_tbl_idx.val[0] = vaddq_u16(
+ merge_tbl_idx.val[0],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)));
+ merge_tbl_idx.val[1] = vaddq_u16(
+ merge_tbl_idx.val[1],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)));
+ merge_tbl_idx.val[2] = vaddq_u16(
+ merge_tbl_idx.val[2],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)));
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
+ transpose_concat_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, s3456);
+
+ do {
+ int16x4_t s7, s8, s9, sA;
+
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+ int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
+ transpose_concat_4x4(s7, s8, s9, sA, s789A);
+
+ vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+ vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+ vpx_tbl2x2_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+ uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filter, max);
+ uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filter, max);
+ uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filter, max);
+ uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filter, max);
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0123[0] = s4567[0];
+ s0123[1] = s4567[1];
+ s1234[0] = s5678[0];
+ s1234[1] = s5678[1];
+ s2345[0] = s6789[0];
+ s2345[1] = s6789[1];
+ s3456[0] = s789A[0];
+ s3456[1] = s789A[1];
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
+ transpose_concat_8x4(s0, s1, s2, s3, s0123);
+ transpose_concat_8x4(s1, s2, s3, s4, s1234);
+ transpose_concat_8x4(s2, s3, s4, s5, s2345);
+ transpose_concat_8x4(s3, s4, s5, s6, s3456);
+
+ do {
+ int16x8_t s7, s8, s9, sA;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+ int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
+ transpose_concat_8x4(s7, s8, s9, sA, s789A);
+
+ vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+ vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+ vpx_tbl2x4_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+ uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, filter, max);
+ uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, filter, max);
+ uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, filter, max);
+ uint16x8_t d3 = highbd_convolve8_8_v(s3456, s789A, filter, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0123[0] = s4567[0];
+ s0123[1] = s4567[1];
+ s0123[2] = s4567[2];
+ s0123[3] = s4567[3];
+ s1234[0] = s5678[0];
+ s1234[1] = s5678[1];
+ s1234[2] = s5678[2];
+ s1234[3] = s5678[3];
+ s2345[0] = s6789[0];
+ s2345[1] = s6789[1];
+ s2345[2] = s6789[2];
+ s2345[3] = s6789[3];
+ s3456[0] = s789A[0];
+ s3456[1] = s789A[1];
+ s3456[2] = s789A[2];
+ s3456[3] = s789A[3];
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
+void vpx_highbd_convolve8_vert_sve2(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (y_step_q4 != 16) {
+ vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, y0_q4, y_step_q4, w, h, bd);
+ return;
+ }
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+ assert(y_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+ vpx_highbd_convolve8_vert_neon(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
+ } else {
+ const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
+ highbd_convolve8_8tap_vert_sve2(src - 3 * src_stride, src_stride, dst,
+ dst_stride, w, h, y_filter_8tap, bd);
+ }
+}
+
+void vpx_highbd_convolve8_avg_vert_sve2(const uint16_t *src,
+ ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h, int bd) {
+ if (y_step_q4 != 16) {
+ vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+ x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+ bd);
+ return;
+ }
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+
+ src -= 3 * src_stride;
+
+ uint16x8x3_t merge_tbl_idx = vld1q_u16_x3(kDotProdMergeBlockTbl);
+
+ // Adjust the indices to account for the SVE vector length.
+ merge_tbl_idx.val[0] = vaddq_u16(
+ merge_tbl_idx.val[0],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)));
+ merge_tbl_idx.val[1] = vaddq_u16(
+ merge_tbl_idx.val[1],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)));
+ merge_tbl_idx.val[2] = vaddq_u16(
+ merge_tbl_idx.val[2],
+ vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)));
+
+ if (w == 4) {
+ const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
+ transpose_concat_4x4(s0, s1, s2, s3, s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, s3456);
+
+ do {
+ int16x4_t s7, s8, s9, sA;
+
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+ int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
+ transpose_concat_4x4(s7, s8, s9, sA, s789A);
+
+ vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+ vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+ vpx_tbl2x2_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+ uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filters, max);
+ uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filters, max);
+ uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filters, max);
+ uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filters, max);
+
+ d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+ d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+ d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+ d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0123[0] = s4567[0];
+ s0123[1] = s4567[1];
+ s1234[0] = s5678[0];
+ s1234[1] = s5678[1];
+ s2345[0] = s6789[0];
+ s2345[1] = s6789[1];
+ s3456[0] = s789A[0];
+ s3456[1] = s789A[1];
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ do {
+ const int16_t *s = (const int16_t *)src;
+ uint16_t *d = dst;
+ int height = h;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
+ transpose_concat_8x4(s0, s1, s2, s3, s0123);
+ transpose_concat_8x4(s1, s2, s3, s4, s1234);
+ transpose_concat_8x4(s2, s3, s4, s5, s2345);
+ transpose_concat_8x4(s3, s4, s5, s6, s3456);
+
+ do {
+ int16x8_t s7, s8, s9, sA;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+ int16x8_t s4567[4], s5678[4], s6789[4], s789A[4];
+ transpose_concat_8x4(s7, s8, s9, sA, s789A);
+
+ vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+ vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+ vpx_tbl2x4_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+ uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, filters, max);
+ uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, filters, max);
+ uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, filters, max);
+ uint16x8_t d3 = highbd_convolve8_8_v(s3456, s789A, filters, max);
+
+ d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+ d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+ d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+ d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0123[0] = s4567[0];
+ s0123[1] = s4567[1];
+ s0123[2] = s4567[2];
+ s0123[3] = s4567[3];
+ s1234[0] = s5678[0];
+ s1234[1] = s5678[1];
+ s1234[2] = s5678[2];
+ s1234[3] = s5678[3];
+ s2345[0] = s6789[0];
+ s2345[1] = s6789[1];
+ s2345[2] = s6789[2];
+ s2345[3] = s6789[3];
+ s3456[0] = s789A[0];
+ s3456[1] = s789A[1];
+ s3456[2] = s789A[2];
+ s3456[3] = s789A[3];
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
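
The vertical kernels above keep the sample window transposed
(transpose_concat_4x4 / _8x4) so the 8-tap vertical filter becomes dot
products along vector lanes, and then avoid re-transposing every iteration:
only the four newest rows (s789A) are transposed fresh, while TBL2 lookups
over the pair { s3456, s789A } with kDotProdMergeBlockTbl synthesize the
shifted windows s4567, s5678 and s6789. SVE table lookups index across both
source vectors, so indices meant to select from the second vector must be
rebased by the runtime vector length; that is what the svcnth() additions
do. A scalar model of the two-vector lookup, illustrative only:

#include <stdint.h>

/* With n 16-bit lanes per vector, index i < n selects from the first table
   and i >= n from the second; n is svcnth() on real hardware. */
static void tbl2_u16_model(const uint16_t *t0, const uint16_t *t1, int n,
                           const uint16_t *idx, uint16_t *out) {
  for (int i = 0; i < n; ++i) {
    out[i] = idx[i] < (uint16_t)n ? t0[idx[i]] : t1[idx[i] - n];
  }
}
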
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
deleted file mode 100644
index 414ade3530..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/vpx_filter.h"
-#include "vpx_ports/mem.h"
-
-void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
- uint16_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4, int w,
- int h, int bd) {
- // + 1 to make it divisible by 4
- uint16_t temp[64 * 136];
- const int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- /* Filter starting 3 lines back. The neon implementation will ignore the given
- * height and filter a multiple of 4 lines. Since this goes in to the temp
- * buffer which has lots of extra room and is subsequently discarded this is
- * safe if somewhat less than ideal. */
- vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
- filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- intermediate_height, bd);
-
- /* Step into the temp buffer 3 lines to get the actual frame data */
- vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
-}
-
-void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
- uint16_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4,
- int w, int h, int bd) {
- // + 1 to make it divisible by 4
- uint16_t temp[64 * 136];
- const int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- /* This implementation has the same issues as above. In addition, we only want
- * to average the values after both passes.
- */
- vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
- filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- intermediate_height, bd);
- vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
- bd);
-}
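
The temp buffer sizing in the wrapper deleted above checks out as follows,
assuming the usual libvpx limits (w, h <= 64 and y_step_q4 <= 32, i.e. at
most a 2:1 vertical scale step): the vertical pass consumes at most roughly
h * y_step_q4 / 16 + (SUBPEL_TAPS - 1) = 64 * 2 + 7 = 135 intermediate rows,
and the "+ 1" noted in the comment rounds that to 136 so the row count stays
divisible by 4, giving the 64 * 136 bound.
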
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
index c54e588239..579096d78a 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
@@ -162,7 +162,7 @@ FUN_FLIP_SIGN(16, q_) // flip_sign_16
#define FUN_FLIP_SIGN_BACK(w, r) \
static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \
- const int8x##w##_t sign_bit = vdup##r##n_s8(0x80); \
+ const int8x##w##_t sign_bit = vdup##r##n_s8((int8_t)0x80); \
return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit)); \
}
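
The cast here is a warning fix rather than a behaviour change: 0x80 is the
int value 128, which is out of range for int8_t, so the implicit narrowing
in vdup_n_s8(0x80) draws implicit-conversion diagnostics; the explicit cast
documents that the 0x80 bit pattern (-128 on the two's-complement targets
libvpx supports) is intended. A minimal sketch:

#include <arm_neon.h>

/* (int8_t)0x80 is -128, i.e. only the sign bit set, in every lane. XORing
   with it moves pixel values between signed and unsigned ranges. */
static inline int8x8_t sign_bit_s8(void) {
  return vdup_n_s8((int8_t)0x80);
}
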
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
index 38b0b6c1a9..268c4bd962 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
+++ b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
@@ -154,11 +154,10 @@ static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) {
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
ptrdiff_t stride) {
uint32_t a;
- uint32x2_t a_u32;
- if (stride == 4) return vld1_u8(buf);
+ uint32x2_t a_u32 = vdup_n_u32(0);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vdup_n_u32(a);
+ a_u32 = vset_lane_u32(a, a_u32, 0);
memcpy(&a, buf, 4);
a_u32 = vset_lane_u32(a, a_u32, 1);
return vreinterpret_u8_u32(a_u32);
@@ -177,11 +176,10 @@ static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) {
static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
ptrdiff_t stride) {
uint64_t a;
- uint64x2_t a_u64;
- if (stride == 4) return vld1q_u16(buf);
+ uint64x2_t a_u64 = vdupq_n_u64(0);
memcpy(&a, buf, 8);
buf += stride;
- a_u64 = vdupq_n_u64(a);
+ a_u64 = vsetq_lane_u64(a, a_u64, 0);
memcpy(&a, buf, 8);
a_u64 = vsetq_lane_u64(a, a_u64, 1);
return vreinterpretq_u16_u64(a_u64);
@@ -191,10 +189,6 @@ static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
const uint8x8_t a) {
const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
- if (stride == 4) {
- vst1_u8(buf, a);
- return;
- }
uint32_to_mem(buf, vget_lane_u32(a_u32, 0));
buf += stride;
uint32_to_mem(buf, vget_lane_u32(a_u32, 1));
@@ -204,11 +198,10 @@ static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
ptrdiff_t stride) {
uint32_t a;
- uint32x4_t a_u32;
- if (stride == 4) return vld1q_u8(buf);
+ uint32x4_t a_u32 = vdupq_n_u32(0);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vdupq_n_u32(a);
+ a_u32 = vsetq_lane_u32(a, a_u32, 0);
memcpy(&a, buf, 4);
buf += stride;
a_u32 = vsetq_lane_u32(a, a_u32, 1);
@@ -225,10 +218,6 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride,
const uint8x16_t a) {
const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
- if (stride == 4) {
- vst1q_u8(buf, a);
- return;
- }
uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
buf += stride;
uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
@@ -449,6 +438,142 @@ static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p,
vst1q_u8(s, s7);
}
+static INLINE void store_u16_4x3(uint16_t *s, const ptrdiff_t p,
+ const uint16x4_t s0, const uint16x4_t s1,
+ const uint16x4_t s2) {
+ vst1_u16(s, s0);
+ s += p;
+ vst1_u16(s, s1);
+ s += p;
+ vst1_u16(s, s2);
+}
+
+static INLINE void load_s16_4x3(const int16_t *s, const ptrdiff_t p,
+ int16x4_t *s0, int16x4_t *s1, int16x4_t *s2) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+}
+
+static INLINE void load_s16_4x4(const int16_t *s, const ptrdiff_t p,
+ int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
+ int16x4_t *s3) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+}
+
+static INLINE void store_u16_4x4(uint16_t *s, const ptrdiff_t p,
+ const uint16x4_t s0, const uint16x4_t s1,
+ const uint16x4_t s2, const uint16x4_t s3) {
+ vst1_u16(s, s0);
+ s += p;
+ vst1_u16(s, s1);
+ s += p;
+ vst1_u16(s, s2);
+ s += p;
+ vst1_u16(s, s3);
+}
+
+static INLINE void load_s16_4x7(const int16_t *s, const ptrdiff_t p,
+ int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
+ int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
+ int16x4_t *s6) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+}
+
+static INLINE void load_s16_8x3(const int16_t *s, const ptrdiff_t p,
+ int16x8_t *s0, int16x8_t *s1, int16x8_t *s2) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x4(const int16_t *s, const ptrdiff_t p,
+ int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+ int16x8_t *s3) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+}
+
+static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
+ uint16x8_t *s3) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+}
+
+static INLINE void store_u16_8x4(uint16_t *s, const ptrdiff_t p,
+ const uint16x8_t s0, const uint16x8_t s1,
+ const uint16x8_t s2, const uint16x8_t s3) {
+ vst1q_u16(s, s0);
+ s += p;
+ vst1q_u16(s, s1);
+ s += p;
+ vst1q_u16(s, s2);
+ s += p;
+ vst1q_u16(s, s3);
+}
+
+static INLINE void store_u16_8x3(uint16_t *s, const ptrdiff_t p,
+ const uint16x8_t s0, const uint16x8_t s1,
+ const uint16x8_t s2) {
+ vst1q_u16(s, s0);
+ s += p;
+ vst1q_u16(s, s1);
+ s += p;
+ vst1q_u16(s, s2);
+}
+
+static INLINE void load_s16_8x7(const int16_t *s, const ptrdiff_t p,
+ int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+ int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
+ int16x8_t *s6) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+}
+
static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
@@ -470,4 +595,46 @@ static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
*s7 = vld1q_u16(s);
}
+static INLINE void load_s16_4x8(const int16_t *s, const ptrdiff_t p,
+ int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
+ int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
+ int16x4_t *s6, int16x4_t *s7) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+ s += p;
+ *s7 = vld1_s16(s);
+}
+
+static INLINE void load_s16_8x8(const int16_t *s, const ptrdiff_t p,
+ int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+ int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
+ int16x8_t *s6, int16x8_t *s7) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+}
+
#endif // VPX_VPX_DSP_ARM_MEM_NEON_H_
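
The unaligned-load helpers earlier in this header gather rows narrower than
a vector with memcpy, which is safe for any alignment, and now build the
result with vset_lane into a zeroed register instead of a vdup of the first
row; the stride == 4 fast paths that issued a single full-width load or
store are gone. A minimal model of the pattern, mirroring the lane layout
the kernels above rely on:

#include <arm_neon.h>
#include <stddef.h>
#include <string.h>

/* Two 4-byte rows packed into one 8-byte vector: row 0 in lanes 0-3 and
   row 1 in lanes 4-7, as in load_unaligned_u8. */
static uint8x8_t load_two_rows_u8(const uint8_t *buf, ptrdiff_t stride) {
  uint32_t r0, r1;
  memcpy(&r0, buf, 4);
  memcpy(&r1, buf + stride, 4);
  uint32x2_t v = vdup_n_u32(0);
  v = vset_lane_u32(r0, v, 0);
  v = vset_lane_u32(r1, v, 1);
  return vreinterpret_u8_u32(v);
}
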
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c
new file mode 100644
index 0000000000..a18cbbd736
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+
+uint64_t vpx_sum_squares_2d_i16_sve(const int16_t *src, int stride, int size) {
+ if (size == 4) {
+ int16x4_t s[4];
+ int64x2_t sum = vdupq_n_s64(0);
+
+ s[0] = vld1_s16(src + 0 * stride);
+ s[1] = vld1_s16(src + 1 * stride);
+ s[2] = vld1_s16(src + 2 * stride);
+ s[3] = vld1_s16(src + 3 * stride);
+
+ int16x8_t s01 = vcombine_s16(s[0], s[1]);
+ int16x8_t s23 = vcombine_s16(s[2], s[3]);
+
+ sum = vpx_dotq_s16(sum, s01, s01);
+ sum = vpx_dotq_s16(sum, s23, s23);
+
+ return horizontal_add_uint64x2(vreinterpretq_u64_s64(sum));
+ } else {
+ int rows = size;
+ int64x2_t sum[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+ vdupq_n_s64(0) };
+
+ do {
+ const int16_t *src_ptr = src;
+ int cols = size;
+
+ do {
+ int16x8_t s[8];
+ load_s16_8x8(src_ptr, stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+
+ sum[0] = vpx_dotq_s16(sum[0], s[0], s[0]);
+ sum[1] = vpx_dotq_s16(sum[1], s[1], s[1]);
+ sum[2] = vpx_dotq_s16(sum[2], s[2], s[2]);
+ sum[3] = vpx_dotq_s16(sum[3], s[3], s[3]);
+ sum[0] = vpx_dotq_s16(sum[0], s[4], s[4]);
+ sum[1] = vpx_dotq_s16(sum[1], s[5], s[5]);
+ sum[2] = vpx_dotq_s16(sum[2], s[6], s[6]);
+ sum[3] = vpx_dotq_s16(sum[3], s[7], s[7]);
+
+ src_ptr += 8;
+ cols -= 8;
+ } while (cols);
+
+ src += 8 * stride;
+ rows -= 8;
+ } while (rows);
+
+ sum[0] = vaddq_s64(sum[0], sum[1]);
+ sum[2] = vaddq_s64(sum[2], sum[3]);
+ sum[0] = vaddq_s64(sum[0], sum[2]);
+
+ return horizontal_add_uint64x2(vreinterpretq_u64_s64(sum[0]));
+ }
+}
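
A scalar reference for the kernel above; each squared int16 is at most 2^30,
so the running total needs 64 bits for any non-trivial block, which is why
the SVE code accumulates directly into 64-bit dot-product lanes:

#include <stdint.h>

static uint64_t sum_squares_2d_i16_ref(const int16_t *src, int stride,
                                       int size) {
  uint64_t sum = 0;
  for (int r = 0; r < size; ++r) {
    for (int c = 0; c < size; ++c) {
      const int64_t v = src[r * stride + c];
      sum += (uint64_t)(v * v);
    }
  }
  return sum;
}
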
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
index 74f85a6bb6..c989a6721b 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
+++ b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -524,12 +524,20 @@ static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1,
*a7 = vreinterpretq_s32_s64(c3.val[1]);
}
-// Note: Using 'd' registers or 'q' registers has almost identical speed. We use
-// 'q' registers here to save some instructions.
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
uint8x8_t *a6, uint8x8_t *a7) {
- // Swap 8 bit elements. Goes from:
+ // Widen to 128-bit registers (usually a no-op once inlined).
+ const uint8x16_t a0q = vcombine_u8(*a0, vdup_n_u8(0));
+ const uint8x16_t a1q = vcombine_u8(*a1, vdup_n_u8(0));
+ const uint8x16_t a2q = vcombine_u8(*a2, vdup_n_u8(0));
+ const uint8x16_t a3q = vcombine_u8(*a3, vdup_n_u8(0));
+ const uint8x16_t a4q = vcombine_u8(*a4, vdup_n_u8(0));
+ const uint8x16_t a5q = vcombine_u8(*a5, vdup_n_u8(0));
+ const uint8x16_t a6q = vcombine_u8(*a6, vdup_n_u8(0));
+ const uint8x16_t a7q = vcombine_u8(*a7, vdup_n_u8(0));
+
+ // Zip 8 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07
// a1: 10 11 12 13 14 15 16 17
// a2: 20 21 22 23 24 25 26 27
@@ -539,43 +547,41 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
// a6: 60 61 62 63 64 65 66 67
// a7: 70 71 72 73 74 75 76 77
// to:
- // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
- // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
- // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
- // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
-
- const uint8x16x2_t b0 =
- vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
- const uint8x16x2_t b1 =
- vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));
-
- // Swap 16 bit elements resulting in:
- // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
- // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
- // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
- // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
-
- const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
- vreinterpretq_u16_u8(b1.val[0]));
- const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
- vreinterpretq_u16_u8(b1.val[1]));
-
- // Unzip 32 bit elements resulting in:
+ // b0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // b1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // b2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // b3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const uint8x16_t b0 = vzipq_u8(a0q, a1q).val[0];
+ const uint8x16_t b1 = vzipq_u8(a2q, a3q).val[0];
+ const uint8x16_t b2 = vzipq_u8(a4q, a5q).val[0];
+ const uint8x16_t b3 = vzipq_u8(a6q, a7q).val[0];
+
+ // Zip 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ // c0.val[1]: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ // c1.val[0]: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ // c1.val[1]: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ const uint16x8x2_t c0 =
+ vzipq_u16(vreinterpretq_u16_u8(b0), vreinterpretq_u16_u8(b1));
+ const uint16x8x2_t c1 =
+ vzipq_u16(vreinterpretq_u16_u8(b2), vreinterpretq_u16_u8(b3));
+
+ // Zip 32 bit elements resulting in:
// d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
- // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d0.val[1]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d1.val[0]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
// d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
- const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ const uint32x4x2_t d0 = vzipq_u32(vreinterpretq_u32_u16(c0.val[0]),
vreinterpretq_u32_u16(c1.val[0]));
- const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ const uint32x4x2_t d1 = vzipq_u32(vreinterpretq_u32_u16(c0.val[1]),
vreinterpretq_u32_u16(c1.val[1]));
*a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
*a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
- *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
- *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
- *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
- *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ *a2 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+ *a3 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ *a4 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+ *a5 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
*a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
*a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
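
The rewrite above trades the vtrn/vuzp sequence for three rounds of zips, so
the transposed rows fall out of d0/d1 in order and the final extraction is a
straight run of vget_low / vget_high pairs. A scalar reference that either
version can be checked against:

#include <stdint.h>

/* In-place 8x8 byte transpose: element (r, c) moves to (c, r). */
static void transpose_8x8_ref(uint8_t m[8][8]) {
  for (int r = 0; r < 8; ++r) {
    for (int c = r + 1; c < 8; ++c) {
      const uint8_t t = m[r][c];
      m[r][c] = m[c][r];
      m[c][r] = t;
    }
  }
}
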
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
index 65fb67c984..037ea1142d 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -20,44 +20,36 @@
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
-// Note:
-// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src).
-// 2. After refactoring the shared code in kernel loops with inline functions,
-// the decoder speed dropped a lot when using gcc compiler. Therefore there is
-// no refactoring for those parts by now.
-// 3. For horizontal convolve, there is an alternative optimization that
-// convolves a single row in each loop. For each row, 8 sample banks with 4 or 8
-// samples in each are read from memory: src, (src+1), (src+2), (src+3),
-// (src+4), (src+5), (src+6), (src+7), or prepared by vector extract
-// instructions. This optimization is much faster in speed unit test, but slowed
-// down the whole decoder by 5%.
-
-static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride, int w,
- int h, const int16x4_t filter) {
+static INLINE void convolve_4tap_horiz_neon(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h,
+ const int16x8_t filter) {
+ // 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const uint8x8_t x_filter =
+ vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1);
+
+ // Neon does not have lane-referencing multiply or multiply-accumulate
+ // instructions that operate on vectors of 8-bit elements. This means we have
+ // to duplicate filter taps into a whole vector and use standard multiply /
+ // multiply-accumulate instructions.
+ const uint8x8_t filter_taps[4] = { vdup_lane_u8(x_filter, 2),
+ vdup_lane_u8(x_filter, 3),
+ vdup_lane_u8(x_filter, 4),
+ vdup_lane_u8(x_filter, 5) };
+
if (w == 4) {
do {
- int16x4_t s0[4], s1[4];
-
- int16x8_t t0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src)));
- s0[0] = vget_low_s16(vextq_s16(t0, t0, 0));
- s0[1] = vget_low_s16(vextq_s16(t0, t0, 1));
- s0[2] = vget_low_s16(vextq_s16(t0, t0, 2));
- s0[3] = vget_low_s16(vextq_s16(t0, t0, 3));
+ uint8x8_t s01[4];
- int16x8_t t1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + src_stride)));
- s1[0] = vget_low_s16(vextq_s16(t1, t1, 0));
- s1[1] = vget_low_s16(vextq_s16(t1, t1, 1));
- s1[2] = vget_low_s16(vextq_s16(t1, t1, 2));
- s1[3] = vget_low_s16(vextq_s16(t1, t1, 3));
+ s01[0] = load_unaligned_u8(src + 0, src_stride);
+ s01[1] = load_unaligned_u8(src + 1, src_stride);
+ s01[2] = load_unaligned_u8(src + 2, src_stride);
+ s01[3] = load_unaligned_u8(src + 3, src_stride);
- int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter);
- int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter);
- uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d01 = convolve4_8(s01[0], s01[1], s01[2], s01[3], filter_taps);
- store_u8(dst, dst_stride, d01);
+ store_unaligned_u8(dst, dst_stride, d01);
src += 2 * src_stride;
dst += 2 * dst_stride;
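The halving above is exact because every 4-tap and bilinear kernel value is even, so the final rounding shift drops from FILTER_BITS (7 in vpx_filter.h) to FILTER_BITS - 1 without changing the result. A scalar sketch of that equivalence, using a hypothetical even 4-tap kernel that sums to 1 << FILTER_BITS:

```c
#include <assert.h>
#include <stdint.h>

#define FILTER_BITS 7

// Round-to-nearest right shift, as used by the convolution kernels.
static int round_shift(int v, int bits) {
  return (v + (1 << (bits - 1))) >> bits;
}

int main(void) {
  // Hypothetical even 4-tap kernel summing to 1 << FILTER_BITS.
  const int taps[4] = { -4, 36, 100, -4 };
  const int half[4] = { -2, 18, 50, -2 };  // taps halved
  const uint8_t s[4] = { 23, 180, 91, 7 }; // arbitrary samples

  int full = 0, halved = 0;
  for (int k = 0; k < 4; ++k) {
    full += s[k] * taps[k];
    halved += s[k] * half[k];
  }
  // Halving every (even) tap halves the sum exactly, so shifting by one bit
  // less reproduces the same rounded result.
  assert(round_shift(full, FILTER_BITS) ==
         round_shift(halved, FILTER_BITS - 1));
  return 0;
}
```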
@@ -70,25 +62,20 @@ static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src,
int width = w;
do {
- int16x8_t t0[2], t1[2];
- int16x8_t s0[4], s1[4];
-
- t0[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
- t0[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 8)));
- s0[0] = vextq_s16(t0[0], t0[1], 0);
- s0[1] = vextq_s16(t0[0], t0[1], 1);
- s0[2] = vextq_s16(t0[0], t0[1], 2);
- s0[3] = vextq_s16(t0[0], t0[1], 3);
-
- t1[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride)));
- t1[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride + 8)));
- s1[0] = vextq_s16(t1[0], t1[1], 0);
- s1[1] = vextq_s16(t1[0], t1[1], 1);
- s1[2] = vextq_s16(t1[0], t1[1], 2);
- s1[3] = vextq_s16(t1[0], t1[1], 3);
-
- uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter);
- uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter);
+ uint8x8_t s0[4], s1[4];
+
+ s0[0] = vld1_u8(s + 0);
+ s0[1] = vld1_u8(s + 1);
+ s0[2] = vld1_u8(s + 2);
+ s0[3] = vld1_u8(s + 3);
+
+ s1[0] = vld1_u8(s + src_stride + 0);
+ s1[1] = vld1_u8(s + src_stride + 1);
+ s1[2] = vld1_u8(s + src_stride + 2);
+ s1[3] = vld1_u8(s + src_stride + 3);
+
+ uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter_taps);
+ uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter_taps);
vst1_u8(d, d0);
vst1_u8(d + dst_stride, d1);
@@ -103,47 +90,41 @@ static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src,
}
}
-static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride, int w,
- int h, const int16x8_t filter) {
- uint8x8_t t0, t1, t2, t3;
-
+static INLINE void convolve_8tap_horiz_neon(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h,
+ const int16x8_t filter) {
if (h == 4) {
- uint8x8_t d01, d23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
+ uint8x8_t t0, t1, t2, t3;
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
src += 7;
do {
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+ transpose_u8_8x4(&t7, &t8, &t9, &t10);
+ int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
+ int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
+ int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
+
+ int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
transpose_u8_4x4(&d01, &d23);
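convolve8_4 and convolve8_8, combined with the vqrshrun_n_s16(..., FILTER_BITS) narrowing above, implement the standard 8-tap sub-pel FIR; the transposes simply let a horizontal filter run down NEON register lanes. For reference, a scalar sketch of the per-pixel operation, assuming the usual libvpx rounding and clamping (the function name is illustrative):

```c
#include <stdint.h>

#define FILTER_BITS 7

// One output pixel of an 8-tap sub-pel filter: a weighted sum of 8 input
// samples, rounded at FILTER_BITS and saturated to [0, 255] -- the scalar
// equivalent of convolve8_4()/convolve8_8() plus the vqrshrun narrowing.
static uint8_t convolve8_pixel(const uint8_t *s, const int16_t *taps) {
  int sum = 0;
  for (int k = 0; k < 8; ++k) sum += s[k] * taps[k];
  sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}
```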
@@ -162,52 +143,33 @@ static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src,
w -= 4;
} while (w != 0);
} else {
- int width;
- const uint8_t *s;
- uint8x8_t t4, t5, t6, t7, d04, d15, d26, d37;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
if (w == 4) {
do {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
&t7);
- src += 8 * src_stride;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
+
transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
- d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
- d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
- d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
transpose_u8_8x4(&d04, &d15, &d26, &d37);
@@ -216,57 +178,53 @@ static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src,
store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);
+ src += 8 * src_stride;
dst += 8 * dst_stride;
h -= 8;
} while (h > 0);
} else {
- uint8_t *d;
- uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
- int16x8_t s11, s12, s13, s14;
-
do {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- width = w;
- s = src + 7;
- d = dst;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ const uint8_t *s = src + 7;
+ uint8_t *d = dst;
+ int width = w;
do {
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
- d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
- d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
- d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
- d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
+ uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
+ load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
+ &t15);
+
+ transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
+ uint8x8_t d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
+ uint8x8_t d6 =
+ convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
+ uint8x8_t d7 =
+ convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
@@ -304,17 +262,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
(void)y0_q4;
(void)y_step_q4;
+ const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
+
if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements.
- */
- const int16x4_t x_filter_4tap = vshr_n_s16(vld1_s16(filter[x0_q4] + 2), 1);
- vpx_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
- x_filter_4tap);
+ convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
+ x_filter);
} else {
- const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
- vpx_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
- x_filter_8tap);
+ convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
+ x_filter);
}
}
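The dispatch above loads the full 8-entry kernel once and takes the narrower path when only the middle taps are in use; rebasing src by -1 (4-tap) or -3 (8-tap) keeps the active taps centred on the output pixel. A hypothetical model of the tap-count probe (the real vpx_get_filter_taps may be implemented differently):

```c
#include <stdint.h>

// Hypothetical sketch of the tap-count probe: an 8-entry kernel with zeros
// in its outer positions behaves as a centred 4-tap (or bilinear) filter,
// which is why the 4-tap path above duplicates lanes 2..5 only.
static int filter_taps_model(const int16_t k[8]) {
  if (k[0] | k[1] | k[6] | k[7]) return 8;  // outer taps in use
  if (k[2] | k[5]) return 4;                // inner 4 taps in use
  return 2;                                 // bilinear: only k[3], k[4]
}
```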
@@ -324,7 +279,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
const int16x8_t filters = vld1q_s16(filter[x0_q4]);
- uint8x8_t t0, t1, t2, t3;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
@@ -337,48 +291,41 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
src -= 3;
if (h == 4) {
- uint8x8_t d01, d23, dd01, dd23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
+ uint8x8_t t0, t1, t2, t3;
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
src += 7;
do {
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+ transpose_u8_8x4(&t7, &t8, &t9, &t10);
+ int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
+ int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
+ int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
+
+ int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
transpose_u8_4x4(&d01, &d23);
- dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride);
- dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride);
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
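The vrhadd_u8 calls above merge the freshly filtered pixels with the existing destination using a rounding halving add, which is what distinguishes the _avg_ kernels from the plain ones. Scalar equivalent for a single pixel (a sketch, not patch code):

```c
#include <stdint.h>

// Rounding average of the filtered result and the current destination
// pixel: the scalar meaning of vrhadd_u8() in the _avg_ kernels.
static uint8_t avg_round(uint8_t filtered, uint8_t dst) {
  return (uint8_t)((filtered + dst + 1) >> 1);
}
```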
@@ -398,61 +345,40 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
w -= 4;
} while (w != 0);
} else {
- int width;
- const uint8_t *s;
- uint8x8_t t4, t5, t6, t7;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
if (w == 4) {
- uint8x8_t d04, d15, d26, d37, dd04, dd15, dd26, dd37;
-
do {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
&t7);
- src += 8 * src_stride;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
+
transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
- d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
transpose_u8_8x4(&d04, &d15, &d26, &d37);
- dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride);
- dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride);
- dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride);
- dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride);
+ uint8x8_t dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride);
+ uint8x8_t dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride);
+ uint8x8_t dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride);
+ uint8x8_t dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride);
d04 = vrhadd_u8(d04, dd04);
d15 = vrhadd_u8(d15, dd15);
@@ -464,65 +390,54 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);
+ src += 8 * src_stride;
dst += 8 * dst_stride;
h -= 8;
} while (h != 0);
} else {
- uint8_t *d;
- uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
- int16x8_t s11, s12, s13, s14;
-
do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
- __builtin_prefetch(src + 7 * src_stride);
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- width = w;
- s = src + 7;
- d = dst;
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(dst + 4 * dst_stride);
- __builtin_prefetch(dst + 5 * dst_stride);
- __builtin_prefetch(dst + 6 * dst_stride);
- __builtin_prefetch(dst + 7 * dst_stride);
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ const uint8_t *s = src + 7;
+ uint8_t *d = dst;
+ int width = w;
do {
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
- d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
- d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
- d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
- d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
+ uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
+ load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
+ &t15);
+
+ transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+ uint8x8_t d5 =
+ convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+ uint8x8_t d6 =
+ convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+ uint8x8_t d7 =
+ convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
@@ -556,152 +471,37 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
-static INLINE void vpx_convolve_4tap_vert_neon(const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride, int w,
- int h, const int16x4_t filter) {
- if (w == 4) {
- uint8x8_t t0, t1, t2, t3, d01, d23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, d0, d1, d2, d3;
-
- load_u8_8x3(src, src_stride, &t0, &t1, &t2);
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
-
- src += 3 * src_stride;
-
- do {
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
-
- d0 = convolve4_4(s0, s1, s2, s3, filter);
- d1 = convolve4_4(s1, s2, s3, s4, filter);
- d2 = convolve4_4(s2, s3, s4, s5, filter);
- d3 = convolve4_4(s3, s4, s5, s6, filter);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h != 0);
- } else {
- int height;
- const uint8_t *s;
- uint8_t *d;
- uint8x8_t t0, t1, t2, t3, d0, d1, d2, d3;
- int16x8_t s0, s1, s2, s3, s4, s5, s6;
-
- do {
- load_u8_8x3(src, src_stride, &t0, &t1, &t2);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-
- s = src + 3 * src_stride;
- d = dst;
- height = h;
-
- do {
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
-
- d0 = convolve4_8(s0, s1, s2, s3, filter);
- d1 = convolve4_8(s1, s2, s3, s4, filter);
- d2 = convolve4_8(s2, s3, s4, s5, filter);
- d3 = convolve4_8(s3, s4, s5, s6, filter);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w != 0);
- }
-}
-
-static INLINE void vpx_convolve_8tap_vert_neon(const uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride, int w,
- int h, const int16x8_t filter) {
+static INLINE void convolve_8tap_vert_neon(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h,
+ const int16x8_t filter) {
if (w == 4) {
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
src += 7 * src_stride;
do {
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
-
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
+ int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
+ int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));
+
+ int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -718,54 +518,33 @@ static INLINE void vpx_convolve_8tap_vert_neon(const uint8_t *src,
h -= 4;
} while (h != 0);
} else {
- int height;
- const uint8_t *s;
- uint8_t *d;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- s = src + 7 * src_stride;
- d = dst;
- height = h;
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ const uint8_t *s = src + 7 * src_stride;
+ uint8_t *d = dst;
+ int height = h;
do {
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
-
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
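The 4-tap vertical path deleted here is not gone: it reappears later in this patch in vpx_convolve8_neon.h as convolve_4tap_vert_neon, reworked to filter uint8 samples directly. For reference, a scalar model of one output column of a 4-tap vertical filter, assuming the usual libvpx rounding and clamping (the function name is illustrative):

```c
#include <stddef.h>
#include <stdint.h>

#define FILTER_BITS 7

// Scalar model of one column of the 4-tap vertical filter: src points at
// the first row of the 4-row window for output row 0. The NEON version
// keeps a rolling window of rows instead ("s0 = s4; s1 = s5; s2 = s6").
static void vert4_column_model(const uint8_t *src, ptrdiff_t stride,
                               uint8_t *dst, ptrdiff_t dst_stride, int h,
                               const int16_t taps[4]) {
  for (int y = 0; y < h; ++y) {
    int sum = 0;
    for (int k = 0; k < 4; ++k) sum += src[(y + k) * stride] * taps[k];
    sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
    dst[y * dst_stride] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
  }
}
```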
@@ -800,17 +579,14 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
(void)x_step_q4;
(void)y_step_q4;
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements.
- */
- const int16x4_t y_filter_4tap = vshr_n_s16(vld1_s16(filter[y0_q4] + 2), 1);
- vpx_convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride,
- w, h, y_filter_4tap);
+ convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
+ y_filter);
} else {
- const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
- vpx_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst,
- dst_stride, w, h, y_filter_8tap);
+ convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, dst_stride,
+ w, h, y_filter);
}
}
@@ -832,45 +608,35 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
src -= 3 * src_stride;
if (w == 4) {
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23, dd01, dd23;
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
- s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
- s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
- s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+ int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+ int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
src += 7 * src_stride;
do {
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
- s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
- s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
- s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
-
- __builtin_prefetch(dst + 0 * dst_stride);
- __builtin_prefetch(dst + 1 * dst_stride);
- __builtin_prefetch(dst + 2 * dst_stride);
- __builtin_prefetch(dst + 3 * dst_stride);
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
-
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
- dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
- dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+ int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
+ int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
+ int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
+ int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));
+
+ int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
@@ -890,54 +656,33 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
h -= 4;
} while (h != 0);
} else {
- int height;
- const uint8_t *s;
- uint8_t *d;
- uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
do {
- __builtin_prefetch(src + 0 * src_stride);
- __builtin_prefetch(src + 1 * src_stride);
- __builtin_prefetch(src + 2 * src_stride);
- __builtin_prefetch(src + 3 * src_stride);
- __builtin_prefetch(src + 4 * src_stride);
- __builtin_prefetch(src + 5 * src_stride);
- __builtin_prefetch(src + 6 * src_stride);
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- s = src + 7 * src_stride;
- d = dst;
- height = h;
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ const uint8_t *s = src + 7 * src_stride;
+ uint8_t *d = dst;
+ int height = h;
do {
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
- __builtin_prefetch(d + 0 * dst_stride);
- __builtin_prefetch(d + 1 * dst_stride);
- __builtin_prefetch(d + 2 * dst_stride);
- __builtin_prefetch(d + 3 * dst_stride);
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
-
- d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
- d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
- d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
- d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ uint8x8_t t7, t8, t9, t10;
+ load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride));
d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride));
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
index 4ecaee0f99..10cc761ccd 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -17,360 +17,6 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"
-#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
- ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4,
- int y_step_q4, int w, int h);
-
-static INLINE int16x4_t convolve4_4_sdot_partial(const int8x16_t samples,
- const int32x4_t correction,
- const int8x8_t filters) {
- /* Accumulate dot product into 'correction' to account for range clamp. */
- int32x4_t sum = vdotq_lane_s32(correction, samples, filters, 0);
-
- /* Further narrowing and packing is performed by the caller. */
- return vmovn_s32(sum);
-}
-
-static INLINE int16x4_t convolve4_4_sdot(const uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16_t permute_tbl) {
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- int8x16_t clamped_samples =
- vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0);
-
- /* Further narrowing and packing is performed by the caller. */
- return vmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve4_8_sdot_partial(const int8x16_t samples_lo,
- const int8x16_t samples_hi,
- const int32x4_t correction,
- const int8x8_t filters) {
- /* Sample range-clamping and permutation are performed by the caller. */
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- int32x4_t sum0 = vdotq_lane_s32(correction, samples_lo, filters, 0);
- /* Second 4 output values. */
- int32x4_t sum1 = vdotq_lane_s32(correction, samples_hi, filters, 0);
-
- /* Narrow and re-pack. */
- int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
- /* We halved the filter values so -1 from right shift. */
- return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE uint8x8_t convolve4_8_sdot(const uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x2_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[2];
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- int32x4_t sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
- /* Second 4 output values. */
- int32x4_t sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
-
- /* Narrow and re-pack. */
- int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
- /* We halved the filter values so -1 from right shift. */
- return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
- const int8x16_t samples_hi,
- const int32x4_t correction,
- const int8x8_t filters) {
- /* Sample range-clamping and permutation are performed by the caller. */
- int32x4_t sum;
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- sum = vdotq_lane_s32(correction, samples_lo, filters, 0);
- sum = vdotq_lane_s32(sum, samples_hi, filters, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE int16x4_t convolve8_4_sdot(const uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x2_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[2];
- int32x4_t sum;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
- sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
- const int8x16_t samples0_hi,
- const int8x16_t samples1_lo,
- const int8x16_t samples1_hi,
- const int32x4_t correction,
- const int8x8_t filters) {
- /* Sample range-clamping and permutation are performed by the caller. */
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum0 = vdotq_lane_s32(correction, samples0_lo, filters, 0);
- sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1);
- /* Second 4 output values. */
- sum1 = vdotq_lane_s32(correction, samples1_lo, filters, 0);
- sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-static INLINE uint8x8_t convolve8_8_sdot(const uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl) {
- int8x16_t clamped_samples, permuted_samples[3];
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
-
- /* Accumulate dot product into 'correction' to account for range clamp. */
- /* First 4 output values. */
- sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
- sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
- /* Second 4 output values. */
- sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
- sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-
-#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4,
- int w, int h);
-
-static INLINE int16x4_t convolve4_4_usdot_partial(const uint8x16_t samples,
- const int8x8_t filters) {
- int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples, filters, 0);
-
- /* Further narrowing and packing is performed by the caller. */
- return vmovn_s32(sum);
-}
-
-static INLINE int16x4_t convolve4_4_usdot(const uint8x16_t samples,
- const int8x8_t filters,
- const uint8x16_t permute_tbl) {
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
-
- int32x4_t sum =
- vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0);
-
- /* Further narrowing and packing is performed by the caller. */
- return vmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve4_8_usdot_partial(const uint8x16_t samples_lo,
- const uint8x16_t samples_hi,
- const int8x8_t filters) {
- /* Sample permutation is performed by the caller. */
- /* First 4 output values. */
- int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
- /* Second 4 output values. */
- int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples_hi, filters, 0);
-
- /* Narrow and re-pack. */
- int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
- /* We halved the filter values so -1 from right shift. */
- return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE uint8x8_t convolve4_8_usdot(const uint8x16_t samples,
- const int8x8_t filters,
- const uint8x16x2_t permute_tbl) {
- uint8x16_t permuted_samples[2];
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-
- /* First 4 output values. */
- int32x4_t sum0 =
- vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
- /* Second 4 output values. */
- int32x4_t sum1 =
- vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
-
- /* Narrow and re-pack. */
- int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
- /* We halved the filter values so -1 from right shift. */
- return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
-static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
- const uint8x16_t samples_hi,
- const int8x8_t filters) {
- /* Sample permutation is performed by the caller. */
- int32x4_t sum;
-
- sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
- sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples,
- const int8x8_t filters,
- const uint8x16x2_t permute_tbl) {
- uint8x16_t permuted_samples[2];
- int32x4_t sum;
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
-
- sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
- sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
-
- /* Further narrowing and packing is performed by the caller. */
- return vqmovn_s32(sum);
-}
-
-static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
- const uint8x16_t samples0_hi,
- const uint8x16_t samples1_lo,
- const uint8x16_t samples1_hi,
- const int8x8_t filters) {
- /* Sample permutation is performed by the caller. */
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* First 4 output values. */
- sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
- sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
- /* Second 4 output values. */
- sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
- sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples,
- const int8x8_t filters,
- const uint8x16x3_t permute_tbl) {
- uint8x16_t permuted_samples[3];
- int32x4_t sum0, sum1;
- int16x8_t sum;
-
- /* Permute samples ready for dot product. */
- /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
- permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
- /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
- permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
- /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
- permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
-
- /* First 4 output values. */
- sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
- sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
- /* Second 4 output values. */
- sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
- sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
-
- /* Narrow and re-pack. */
- sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
- return vqrshrun_n_s16(sum, FILTER_BITS);
-}
-
-#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
-
-static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1,
- const int16x4_t s2, const int16x4_t s3,
- const int16x4_t filters) {
- int16x4_t sum = vmul_lane_s16(s0, filters, 0);
- sum = vmla_lane_s16(sum, s1, filters, 1);
- sum = vmla_lane_s16(sum, s2, filters, 2);
- sum = vmla_lane_s16(sum, s3, filters, 3);
- return sum;
-}
-
-static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1,
- const int16x8_t s2, const int16x8_t s3,
- const int16x4_t filters) {
- int16x8_t sum = vmulq_lane_s16(s0, filters, 0);
- sum = vmlaq_lane_s16(sum, s1, filters, 1);
- sum = vmlaq_lane_s16(sum, s2, filters, 2);
- sum = vmlaq_lane_s16(sum, s3, filters, 3);
- /* We halved the filter values so -1 from right shift. */
- return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
@@ -428,4 +74,99 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
filters);
}
+// 2-tap (bilinear) filter values are always positive, but 4-tap filter values
+// are negative on the outer edges (taps 0 and 3), with taps 1 and 2 having much
+// greater positive values to compensate. To use instructions that operate on
+// 8-bit types we also need the types to be unsigned. Subtracting the products
+// of taps 0 and 3 from the products of taps 1 and 2 always works given that
+// 2-tap filters are 0-padded.
+static INLINE uint8x8_t convolve4_8(const uint8x8_t s0, const uint8x8_t s1,
+ const uint8x8_t s2, const uint8x8_t s3,
+ const uint8x8_t filter_taps[4]) {
+ uint16x8_t sum = vmull_u8(s1, filter_taps[1]);
+ sum = vmlal_u8(sum, s2, filter_taps[2]);
+ sum = vmlsl_u8(sum, s0, filter_taps[0]);
+ sum = vmlsl_u8(sum, s3, filter_taps[3]);
+ // We halved the filter values so -1 from right shift.
+ return vqrshrun_n_s16(vreinterpretq_s16_u16(sum), FILTER_BITS - 1);
+}
+
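A minimal scalar sketch of why the unsigned scheme in convolve4_8 is safe, assuming a representative halved 4-tap kernel (-2, +34, +34, -2) and arbitrary sample values: the uint16 accumulator may wrap, but reinterpreting the bits as int16 recovers the signed convolution, which is what the vreinterpretq_s16_u16 above relies on.

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* Hypothetical halved 4-tap kernel: -2, +34, +34, -2 (magnitudes below). */
  const uint8_t neg0 = 2, pos1 = 34, pos2 = 34, neg3 = 2;
  const uint8_t s0 = 255, s1 = 0, s2 = 3, s3 = 255;

  /* Reference signed convolution. */
  int32_t ref = -2 * s0 + 34 * s1 + 34 * s2 - 2 * s3;

  /* vmull_u8 / vmlal_u8 / vmlsl_u8 model: uint16 maths, wrapping mod 2^16. */
  uint16_t acc = (uint16_t)(s1 * pos1);
  acc = (uint16_t)(acc + s2 * pos2);
  acc = (uint16_t)(acc - s0 * neg0);
  acc = (uint16_t)(acc - s3 * neg3);

  /* vreinterpretq_s16_u16 model: same bits, two's-complement signed view. */
  assert((int16_t)acc == ref);
  return 0;
}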
+static INLINE void convolve_4tap_vert_neon(const uint8_t *src,
+ ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h,
+ const int16x8_t filter) {
+ // 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const uint8x8_t y_filter =
+ vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1);
+
+ // Neon does not have lane-referencing multiply or multiply-accumulate
+ // instructions that operate on vectors of 8-bit elements. This means we have
+ // to duplicate filter taps into a whole vector and use standard multiply /
+ // multiply-accumulate instructions.
+ const uint8x8_t filter_taps[4] = { vdup_lane_u8(y_filter, 2),
+ vdup_lane_u8(y_filter, 3),
+ vdup_lane_u8(y_filter, 4),
+ vdup_lane_u8(y_filter, 5) };
+
+ if (w == 4) {
+ uint8x8_t s01 = load_unaligned_u8(src + 0 * src_stride, src_stride);
+ uint8x8_t s12 = load_unaligned_u8(src + 1 * src_stride, src_stride);
+
+ src += 2 * src_stride;
+
+ do {
+ uint8x8_t s23 = load_unaligned_u8(src + 0 * src_stride, src_stride);
+ uint8x8_t s34 = load_unaligned_u8(src + 1 * src_stride, src_stride);
+ uint8x8_t s45 = load_unaligned_u8(src + 2 * src_stride, src_stride);
+ uint8x8_t s56 = load_unaligned_u8(src + 3 * src_stride, src_stride);
+
+ uint8x8_t d01 = convolve4_8(s01, s12, s23, s34, filter_taps);
+ uint8x8_t d23 = convolve4_8(s23, s34, s45, s56, filter_taps);
+
+ store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ s01 = s45;
+ s12 = s56;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+
+ uint8x8_t s0, s1, s2;
+ load_u8_8x3(s, src_stride, &s0, &s1, &s2);
+
+ s += 3 * src_stride;
+
+ do {
+ uint8x8_t s3, s4, s5, s6;
+ load_u8_8x4(s, src_stride, &s3, &s4, &s5, &s6);
+
+ uint8x8_t d0 = convolve4_8(s0, s1, s2, s3, filter_taps);
+ uint8x8_t d1 = convolve4_8(s1, s2, s3, s4, filter_taps);
+ uint8x8_t d2 = convolve4_8(s2, s3, s4, s5, filter_taps);
+ uint8x8_t d3 = convolve4_8(s3, s4, s5, s6, filter_taps);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
+
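A scalar sketch of the |filter| >> 1 narrowing above, assuming a representative zero-padded 4-tap kernel: 4-tap and bilinear taps are even and no larger than 128 in magnitude, so the halved absolute values fit uint8_t, and the mlal/mlsl calls in convolve4_8 re-apply the signs.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

int main(void) {
  /* Hypothetical 4-tap kernel in 8-element layout (taps live in 2..5). */
  const int16_t filter[8] = { 0, 0, -4, 68, 68, -4, 0, 0 };

  uint8_t halved[8];
  for (int i = 0; i < 8; i++) {
    uint16_t a = (uint16_t)abs(filter[i]);
    halved[i] = (uint8_t)(a >> 1);  /* vshrn_n_u16(vabsq_s16(filter), 1) */
  }

  /* Lanes 2..5 become the per-tap vectors duplicated by vdup_lane_u8. */
  assert(halved[2] == 2 && halved[3] == 34 && halved[4] == 34 &&
         halved[5] == 2);
  return 0;
}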
#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
index 00bac3b9cf..b05a49d3fe 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
@@ -20,270 +20,139 @@
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
+// Filter values always sum to 128.
+#define FILTER_SUM 128
+
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};
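A scalar sketch of what the first row of dot_prod_permute_tbl buys, assuming a hypothetical halved 4-tap kernel: one table lookup arranges four overlapping 4-sample windows back to back, so a single dot-product instruction produces four neighbouring output pixels.

#include <stdio.h>
#include <stdint.h>

int main(void) {
  const uint8_t src[16] = { 0, 1, 2,  3,  4,  5,  6,  7,
                            8, 9, 10, 11, 12, 13, 14, 15 };
  const uint8_t tbl[16] = { 0, 1, 2, 3, 1, 2, 3, 4,
                            2, 3, 4, 5, 3, 4, 5, 6 };
  const int8_t taps[4] = { -2, 34, 34, -2 };  /* hypothetical halved kernel */

  uint8_t perm[16];
  for (int i = 0; i < 16; i++) perm[i] = src[tbl[i]];  /* vqtbl1q_u8 model */

  /* Each group of four permuted bytes dots with the four taps: one output
   * pixel per group, i.e. a single vdotq_lane_s32 yields outputs 0..3. */
  for (int out = 0; out < 4; out++) {
    int32_t sum = 0;
    for (int k = 0; k < 4; k++) sum += taps[k] * perm[4 * out + k];
    printf("out[%d] = %d\n", out, sum);
  }
  return 0;
}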
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
- 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
- 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
-};
-
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
- /* Shift left and insert new last column in transposed 4x4 block. */
+ // Shift left and insert new last column in transposed 4x4 block.
1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
- /* Shift left and insert two new columns in transposed 4x4 block. */
+ // Shift left and insert two new columns in transposed 4x4 block.
2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
- /* Shift left and insert three new columns in transposed 4x4 block. */
+ // Shift left and insert three new columns in transposed 4x4 block.
3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};
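A scalar model of how the first row of dot_prod_merge_block_tbl is used, with stand-in data: the vertical loops keep the previous transposed block (s3456), and one vqtbl2q_s8 over { s3456, s78910 } rebuilds s4567 by shifting each transposed 4x4 block left one column and pulling the new column from the fresh data.

#include <stdio.h>
#include <stdint.h>

int main(void) {
  /* Stand-in transposed blocks: four 4-row columns each. */
  int8_t s3456[16], s78910[16];
  for (int i = 0; i < 16; i++) {
    s3456[i] = (int8_t)(30 + i);
    s78910[i] = (int8_t)(70 + i);
  }

  /* First row of dot_prod_merge_block_tbl. */
  const uint8_t tbl[16] = { 1, 2,  3,  16, 5,  6,  7,  20,
                            9, 10, 11, 24, 13, 14, 15, 28 };

  /* vqtbl2q_s8 model: indices 0-15 read the first table register,
   * indices 16-31 read the second. */
  int8_t s4567[16];
  for (int i = 0; i < 16; i++)
    s4567[i] = (tbl[i] < 16) ? s3456[tbl[i]] : s78910[tbl[i] - 16];

  for (int i = 0; i < 16; i++) printf("%d ", s4567[i]);
  printf("\n");
  return 0;
}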
-static INLINE void vpx_convolve_4tap_2d_horiz_neon_dotprod(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x16_t range_limit) {
- uint8x16_t s0, s1, s2, s3;
-
- if (w == 4) {
- const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- do {
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- load_u8_16x3(src, src_stride, &s0, &s1, &s2);
-
- d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8_4x1(dst + 2 * dst_stride, d23);
- } else {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16_t permute_tbl) {
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t samples_128 =
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
- d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl);
- store_u8_8x3(d, dst_stride, d0, d1, d2);
+ // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide
+ // by 2 since we halved the filter values.)
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2);
+ int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0);
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- }
+ // Further narrowing and packing is performed by the caller.
+ return vmovn_s32(sum);
}
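A quick scalar check of the range-transform correction used by convolve4_4_h, assuming a hypothetical halved kernel and arbitrary samples: shifting every sample down by 128 perturbs the dot product by exactly 128 * sum(filter), so seeding the accumulator with that constant (here 128 * FILTER_SUM / 2, since the taps were halved) restores the true result.

#include <assert.h>
#include <stdint.h>

int main(void) {
  const int8_t f[4] = { -2, 34, 34, -2 };  /* halved taps, sum = 64 */
  const uint8_t s[4] = { 7, 250, 128, 0 };

  int32_t filter_sum = 0, plain = 0, shifted = 0;
  for (int i = 0; i < 4; i++) {
    filter_sum += f[i];
    plain += f[i] * s[i];
    shifted += f[i] * ((int32_t)s[i] - 128);  /* samples now in [-128, 127] */
  }

  /* The kernel seeds acc with 128 * FILTER_SUM / 2 = 128 * 64 for the same
   * reason: the correction exactly cancels the -128 sample offset. */
  assert(shifted + 128 * filter_sum == plain);
  return 0;
}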
-static INLINE void vpx_convolve_8tap_2d_horiz_neon_dotprod(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x16_t range_limit) {
- uint8x16_t s0, s1, s2, s3;
-
- if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- do {
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- load_u8_16x3(src, src_stride, &s0, &s1, &s2);
-
- d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8_4x1(dst + 2 * dst_stride, d23);
- } else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x3(s, src_stride, &s0, &s1, &s2);
-
- d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
-
- store_u8_8x3(d, dst_stride, d0, d1, d2);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width != 0);
- }
+static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl) {
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t samples_128 =
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
+ vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
+
+ // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide
+ // by 2 since we halved the filter values.)
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2);
+ // First 4 output values.
+ int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
+ // Second 4 output values.
+ int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+ // We halved the filter values so -1 from right shift.
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
-void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
- ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4,
- int y_step_q4, int w, int h) {
- const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
- const int32x4_t correction_8tap =
- vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS)));
- const uint8x16_t range_limit = vdupq_n_u8(128);
-
- assert((intptr_t)dst % 4 == 0);
- assert(dst_stride % 4 == 0);
- assert(x_step_q4 == 16);
-
- (void)x_step_q4;
- (void)y0_q4;
- (void)y_step_q4;
-
- if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
- const int8x8_t x_filter_4tap =
- vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
- const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
- vpx_convolve_4tap_2d_horiz_neon_dotprod(src - 1, src_stride, dst,
- dst_stride, w, h, x_filter_4tap,
- correction_4tap, range_limit);
+static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl) {
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t samples_128 =
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
+ vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
+
+ // Accumulate into 128 * FILTER_SUM to account for range transform.
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+ int32x4_t sum = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
+ sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1);
+
+ // Further narrowing and packing is performed by the caller.
+ return vshrn_n_s32(sum, 1);
+}
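A back-of-envelope check of why convolve8_4_h narrows with vshrn_n_s32(sum, 1) and lets the caller finish with a FILTER_BITS - 1 rounding shift, assuming a representative sharp 8-tap kernel (values assumed; they sum to 128): the raw dot product can exceed int16_t range, and the early 1-bit shift restores headroom while the two shifts still total FILTER_BITS.

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* Representative sharp 8-tap kernel (values assumed; sum is 128). */
  const int16_t f[8] = { -1, 3, -7, 127, 8, -3, 1, 0 };

  int32_t max_dot = 0;
  for (int i = 0; i < 8; i++)
    if (f[i] > 0) max_dot += 255 * f[i];  /* worst-case positive sum */

  assert(max_dot > INT16_MAX);          /* 35445: too big for int16_t      */
  assert((max_dot >> 1) <= INT16_MAX);  /* fits after the early 1-bit shift */
  return 0;
}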
- } else {
- vpx_convolve_8tap_2d_horiz_neon_dotprod(src - 3, src_stride, dst,
- dst_stride, w, h, x_filter_8tap,
- correction_8tap, range_limit);
- }
+static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x3_t permute_tbl) {
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x16_t samples_128 =
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
+ vqtbl1q_s8(samples_128, permute_tbl.val[1]),
+ vqtbl1q_s8(samples_128, permute_tbl.val[2]) };
+
+ // Accumulate into 128 * FILTER_SUM to account for range transform.
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+ // First 4 output values.
+ int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
+ sum0 = vdotq_lane_s32(sum0, perm_samples[1], filters, 1);
+ // Second 4 output values.
+ int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
+ sum1 = vdotq_lane_s32(sum1, perm_samples[2], filters, 1);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
-static INLINE void vpx_convolve_4tap_horiz_neon_dotprod(
+static INLINE void convolve_4tap_horiz_neon_dotprod(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x16_t range_limit) {
- uint8x16_t s0, s1, s2, s3;
-
+ ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
if (w == 4) {
- const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23;
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- t1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- t2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- t3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+ int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
+ int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
+ int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
+ int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
+ // We halved the filter values so -1 from right shift.
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -293,23 +162,21 @@ static INLINE void vpx_convolve_4tap_horiz_neon_dotprod(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+ uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
@@ -324,26 +191,22 @@ static INLINE void vpx_convolve_4tap_horiz_neon_dotprod(
}
}
-static INLINE void vpx_convolve_8tap_horiz_neon_dotprod(
+static INLINE void convolve_8tap_horiz_neon_dotprod(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x16_t range_limit) {
- uint8x16_t s0, s1, s2, s3;
-
+ ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
- t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
- t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
- t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+ int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl);
+ int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -353,23 +216,21 @@ static INLINE void vpx_convolve_8tap_horiz_neon_dotprod(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
- d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
- d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
- d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
@@ -389,11 +250,6 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
- const int32x4_t correction_8tap =
- vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS)));
- const uint8x16_t range_limit = vdupq_n_u8(128);
-
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
assert(x_step_q4 == 16);
@@ -403,21 +259,21 @@ void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
(void)y_step_q4;
if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
+    // Load the 4-tap filter into the first 4 elements of the vector.
+ // All 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
const int8x8_t x_filter_4tap =
- vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
- const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
- vpx_convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride,
- w, h, x_filter_4tap, correction_4tap,
- range_limit);
+ vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+
+ convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride, w, h,
+ x_filter_4tap);
} else {
- vpx_convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride,
- w, h, x_filter_8tap, correction_8tap,
- range_limit);
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+ convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride, w, h,
+ x_filter_8tap);
}
}
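A small sketch of the kernel layout this dispatch assumes, with hypothetical tap values: sub-8-tap kernels are stored zero-padded in an 8-element InterpKernel with the live taps in elements 2..5, which is why the 4-tap path loads from filter[x0_q4] + 2 and anchors at src - 1 rather than src - 3.

#include <stdio.h>
#include <stdint.h>

int main(void) {
  /* Hypothetical 4-tap kernel stored in 8-tap InterpKernel layout. */
  const int16_t kernel[8] = { 0, 0, -4, 68, 68, -4, 0, 0 };

  /* What vld1_s16(filter[x0_q4] + 2) reads in the 4-tap path. */
  const int16_t *taps4 = kernel + 2;
  for (int i = 0; i < 4; i++) printf("%d ", taps4[i]);
  printf("\n"); /* -4 68 68 -4 */
  return 0;
}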
@@ -428,10 +284,6 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
int x_step_q4, int y0_q4,
int y_step_q4, int w, int h) {
const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
- const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
- const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
- const uint8x16_t range_limit = vdupq_n_u8(128);
- uint8x16_t s0, s1, s2, s3;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
@@ -444,22 +296,21 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
src -= 3;
if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23, dd01, dd23;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
- t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
- t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
- t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+ int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl);
+ int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl);
+ int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl);
+ int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
- dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
- dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
@@ -472,24 +323,23 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
h -= 4;
} while (h != 0);
} else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
- d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
- d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
- d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);
+ uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl);
+ uint8x8_t dd0, dd1, dd2, dd3;
load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
d0 = vrhadd_u8(d0, dd0);
@@ -511,260 +361,142 @@ void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
}
static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
- int8x8_t a3, int8x16_t *b,
- const uint8x16_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, XX, XX, XX, XX
- * a1: 10, 11, 12, 13, XX, XX, XX, XX
- * a2: 20, 21, 22, 23, XX, XX, XX, XX
- * a3: 30, 31, 32, 33, XX, XX, XX, XX
- *
- * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
- *b = vqtbl2q_s8(samples, permute_tbl);
+ int8x8_t a3, int8x16_t *b) {
+ // Transpose 8-bit elements and concatenate result rows as follows:
+ // a0: 00, 01, 02, 03, XX, XX, XX, XX
+ // a1: 10, 11, 12, 13, XX, XX, XX, XX
+ // a2: 20, 21, 22, 23, XX, XX, XX, XX
+ // a3: 30, 31, 32, 33, XX, XX, XX, XX
+ //
+ // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+
+ int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
+ int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
+ int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
+ int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
+
+ int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
+ int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];
+
+ int16x8_t a0123 =
+ vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0];
+
+ *b = vreinterpretq_s8_s16(a0123);
}
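A scalar model of the zip-based transpose_concat_4x4 above, using stand-in row values: interleaving rows 0/1 and 2/3 byte-wise, then interleaving those results as 16-bit pairs, lays the 4x4 block out column-major, ready for one 4-tap dot product per column.

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void) {
  const int8_t row[4][4] = {
    { 1, 2, 3, 4 }, { 5, 6, 7, 8 }, { 9, 10, 11, 12 }, { 13, 14, 15, 16 }
  };

  /* Stage 1: byte-wise zip of rows 0/1 and 2/3 (vzipq_s8, low half). */
  int8_t a01[8], a23[8];
  for (int i = 0; i < 4; i++) {
    a01[2 * i] = row[0][i];
    a01[2 * i + 1] = row[1][i];
    a23[2 * i] = row[2][i];
    a23[2 * i + 1] = row[3][i];
  }

  /* Stage 2: 16-bit-pair zip of the two results (vzipq_s16, low half). */
  int8_t b[16];
  for (int i = 0; i < 4; i++) {
    memcpy(&b[4 * i], &a01[2 * i], 2);
    memcpy(&b[4 * i + 2], &a23[2 * i], 2);
  }

  /* b holds the block transposed: input column j occupies b[4j..4j+3]. */
  for (int j = 0; j < 4; j++)
    for (int i = 0; i < 4; i++) assert(b[4 * j + i] == row[i][j]);
  return 0;
}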
static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
int8x8_t a3, int8x16_t *b0,
- int8x16_t *b1,
- const uint8x16x2_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, 04, 05, 06, 07
- * a1: 10, 11, 12, 13, 14, 15, 16, 17
- * a2: 20, 21, 22, 23, 24, 25, 26, 27
- * a3: 30, 31, 32, 33, 34, 35, 36, 37
- *
- * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
- *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
- *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+ int8x16_t *b1) {
+ // Transpose 8-bit elements and concatenate result rows as follows:
+ // a0: 00, 01, 02, 03, 04, 05, 06, 07
+ // a1: 10, 11, 12, 13, 14, 15, 16, 17
+ // a2: 20, 21, 22, 23, 24, 25, 26, 27
+ // a3: 30, 31, 32, 33, 34, 35, 36, 37
+ //
+ // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+
+ int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
+ int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
+ int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
+ int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
+
+ int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
+ int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];
+
+ int16x8x2_t a0123 =
+ vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23));
+
+ *b0 = vreinterpretq_s8_s16(a0123.val[0]);
+ *b1 = vreinterpretq_s8_s16(a0123.val[1]);
}
-static INLINE void vpx_convolve_4tap_vert_neon_dotprod(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x8_t range_limit) {
- const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t t0, t1, t2, t3, t4, t5, t6;
- int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int8x16x2_t samples_LUT;
-
- if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- int8x16_t s0123, s1234, s2345, s3456, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- src += 7 * src_stride;
+static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo,
+ const int8x16_t samples_hi,
+ const int8x8_t filters) {
+ // The sample range transform and permutation are performed by the caller.
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ // Accumulate into 128 * FILTER_SUM to account for range transform.
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+ int32x4_t sum = vdotq_lane_s32(acc, samples_lo, filters, 0);
+ sum = vdotq_lane_s32(sum, samples_hi, filters, 1);
- do {
- uint8x8_t t7, t8, t9, t10;
- load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
-
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
-
- d0 = convolve4_4_sdot_partial(s0123, correction, filter);
- d1 = convolve4_4_sdot_partial(s1234, correction, filter);
- d2 = convolve4_4_sdot_partial(s2345, correction, filter);
- d3 = convolve4_4_sdot_partial(s3456, correction, filter);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s0123 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s1234 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s2345 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- s3456 = s78910;
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h != 0);
- } else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
- do {
- height = h;
- s = src;
- d = dst;
-
- load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
- s += 7 * src_stride;
-
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
-
- do {
- uint8x8_t t7, t8, t9, t10;
- load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
-
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
-
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
-
- d0 = convolve4_8_sdot_partial(s0123_lo, s0123_hi, correction, filter);
- d1 = convolve4_8_sdot_partial(s1234_lo, s1234_hi, correction, filter);
- d2 = convolve4_8_sdot_partial(s2345_lo, s2345_hi, correction, filter);
- d3 = convolve4_8_sdot_partial(s3456_lo, s3456_hi, correction, filter);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s0123_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s1234_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s2345_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- s3456_lo = s78910_lo;
-
- samples_LUT.val[0] = s3456_hi;
- samples_LUT.val[1] = s78910_hi;
- s0123_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s1234_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s2345_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- s3456_hi = s78910_hi;
+ // Further narrowing and packing is performed by the caller.
+ return vshrn_n_s32(sum, 1);
+}
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w != 0);
- }
+static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo,
+ const int8x16_t samples0_hi,
+ const int8x16_t samples1_lo,
+ const int8x16_t samples1_hi,
+ const int8x8_t filters) {
+ // The sample range transform and permutation are performed by the caller.
+
+ // Accumulate into 128 * FILTER_SUM to account for range transform.
+ int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+ // First 4 output values.
+ int32x4_t sum0 = vdotq_lane_s32(acc, samples0_lo, filters, 0);
+ sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1);
+ // Second 4 output values.
+ int32x4_t sum1 = vdotq_lane_s32(acc, samples1_lo, filters, 0);
+ sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
-static INLINE void vpx_convolve_8tap_vert_neon_dotprod(
+static INLINE void convolve_8tap_vert_neon_dotprod(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
- const int32x4_t correction, const uint8x8_t range_limit) {
+ ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t t0, t1, t2, t3, t4, t5, t6;
- int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int8x16x2_t samples_LUT;
if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
src += 7 * src_stride;
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+ int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+ int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+ int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+ int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+ int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+ int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ int8x16_t s0123, s1234, s2345, s3456;
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456);
do {
uint8x8_t t7, t8, t9, t10;
-
load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+ int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+ int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+ int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+ int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+ int8x16_t s78910;
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ int8x16x2_t samples_LUT = { { s3456, s78910 } };
+ int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter);
- d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter);
- d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter);
- d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
+ int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
+ int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
+ int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -781,83 +513,70 @@ static INLINE void vpx_convolve_8tap_vert_neon_dotprod(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
- s6789_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
do {
- height = h;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
s += 7 * src_stride;
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+ int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+ int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+ int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+ int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+ int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+ int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi;
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
uint8x8_t t7, t8, t9, t10;
-
load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+ int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+ int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+ int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+ int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
+ int8x16_t s78910_lo, s78910_hi;
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+ int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
samples_LUT.val[0] = s3456_hi;
samples_LUT.val[1] = s78910_hi;
- s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- correction, filter);
- d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- correction, filter);
- d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- correction, filter);
- d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- correction, filter);
+ int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ uint8x8_t d0 =
+ convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
+ uint8x8_t d1 =
+ convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
+ uint8x8_t d2 =
+ convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
+ uint8x8_t d3 =
+ convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123_lo = s4567_lo;
s0123_hi = s4567_hi;
s1234_lo = s5678_lo;
@@ -883,11 +602,6 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
- const int32x4_t correction_8tap =
- vdupq_n_s32(vaddlvq_s16(vshll_n_s8(y_filter_8tap, FILTER_BITS)));
- const uint8x8_t range_limit = vdup_n_u8(128);
-
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
assert(y_step_q4 == 16);
@@ -897,20 +611,15 @@ void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
(void)y_step_q4;
if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
- const int8x8_t y_filter_4tap =
- vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2);
- const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
- vpx_convolve_4tap_vert_neon_dotprod(src - src_stride, src_stride, dst,
- dst_stride, w, h, y_filter_4tap,
- correction_4tap, range_limit);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
+ y_filter);
} else {
- vpx_convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst,
- dst_stride, w, h, y_filter_8tap,
- correction_8tap, range_limit);
+ const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+ convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst,
+ dst_stride, w, h, y_filter);
}
}
@@ -921,13 +630,7 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
int x_step_q4, int y0_q4,
int y_step_q4, int w, int h) {
const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
- const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
- const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
- const uint8x8_t range_limit = vdup_n_u8(128);
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t t0, t1, t2, t3, t4, t5, t6;
- int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- int8x16x2_t samples_LUT;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
@@ -940,59 +643,54 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
src -= 3 * src_stride;
if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23, dd01, dd23;
-
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
src += 7 * src_stride;
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+ int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+ int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+ int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+ int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+ int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+ int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ int8x16_t s0123, s1234, s2345, s3456;
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456);
do {
uint8x8_t t7, t8, t9, t10;
-
load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+ int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+ int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+ int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+ int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+ int8x16_t s78910;
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ int8x16x2_t samples_LUT = { { s3456, s78910 } };
+ int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
- d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
- d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
- d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ int16x4_t d0 = convolve8_4_v(s0123, s4567, filters);
+ int16x4_t d1 = convolve8_4_v(s1234, s5678, filters);
+ int16x4_t d2 = convolve8_4_v(s2345, s6789, filters);
+ int16x4_t d3 = convolve8_4_v(s3456, s78910, filters);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
- dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
- dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
@@ -1000,8 +698,8 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123 = s4567;
s1234 = s5678;
s2345 = s6789;
@@ -1012,79 +710,67 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
- s6789_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
do {
- height = h;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
s += 7 * src_stride;
- /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
- s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
- s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
- s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
- s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
- s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
- s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
- s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
+ // Transform sample range to [-128, 127] for 8-bit signed dot product.
+ int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+ int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+ int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+ int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+ int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+ int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+ int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi;
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
uint8x8_t t7, t8, t9, t10;
-
load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
- s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
- s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
- s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
- s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+ int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+ int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+ int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+ int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
+ int8x16_t s78910_lo, s78910_hi;
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+ int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
samples_LUT.val[0] = s3456_hi;
samples_LUT.val[1] = s78910_hi;
- s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
- s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
- s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- correction, filters);
- d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- correction, filters);
- d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- correction, filters);
- d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- correction, filters);
-
+ int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ uint8x8_t d0 =
+ convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters);
+ uint8x8_t d1 =
+ convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters);
+ uint8x8_t d2 =
+ convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters);
+ uint8x8_t d3 =
+ convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters);
+
+ uint8x8_t dd0, dd1, dd2, dd3;
load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
d0 = vrhadd_u8(d0, dd0);
@@ -1094,8 +780,8 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123_lo = s4567_lo;
s0123_hi = s4567_hi;
s1234_lo = s5678_lo;
@@ -1115,3 +801,275 @@ void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
} while (w != 0);
}
}
+
+static INLINE void convolve_4tap_2d_neon_dotprod(const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride, int w,
+ int h, const int8x8_t x_filter,
+ const uint8x8_t y_filter) {
+ // Neon does not have lane-referencing multiply or multiply-accumulate
+ // instructions that operate on vectors of 8-bit elements. This means we have
+ // to duplicate filter taps into a whole vector and use standard multiply /
+ // multiply-accumulate instructions.
+ const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2),
+ vdup_lane_u8(y_filter, 3),
+ vdup_lane_u8(y_filter, 4),
+ vdup_lane_u8(y_filter, 5) };
+
+ if (w == 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+
+ uint8x16_t h_s0, h_s1, h_s2;
+ load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
+
+ int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl);
+ int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl);
+ int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl);
+ // We halved the filter values so -1 from right shift.
+ uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1);
+
+ src += 3 * src_stride;
+
+ do {
+ uint8x16_t h_s3, h_s4, h_s5, h_s6;
+ load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+ int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl);
+ int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl);
+ int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl);
+ int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl);
+ // We halved the filter values so -1 from right shift.
+ uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1);
+ uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1);
+ uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4);
+ uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4);
+
+ uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps);
+ uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps);
+
+ store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ v_s01 = v_s45;
+ v_s12 = v_s56;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+
+ uint8x16_t h_s0, h_s1, h_s2;
+ load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2);
+
+ uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl);
+ uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl);
+ uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl);
+
+ s += 3 * src_stride;
+
+ do {
+ uint8x16_t h_s3, h_s4, h_s5, h_s6;
+ load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+ uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl);
+ uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl);
+ uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl);
+ uint8x8_t v_s6 = convolve4_8_h(h_s6, x_filter, permute_tbl);
+
+ uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps);
+ uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps);
+ uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps);
+ uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
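
The convolve4_8 helper called above is defined in a shared Neon header that this patch does not modify. As a rough mental model only, here is a minimal sketch of the duplicated-tap approach the comment describes, assuming non-negative taps (which holds for the bilinear kernels this <= 4-tap path serves, and is enforced by the vabsq_s16 in the caller); the name convolve4_8_sketch is illustrative, not libvpx API:

static INLINE uint8x8_t convolve4_8_sketch(uint8x8_t s0, uint8x8_t s1,
                                           uint8x8_t s2, uint8x8_t s3,
                                           const uint8x8_t taps[4]) {
  // Widening multiply-accumulate, each tap pre-duplicated across a vector.
  uint16x8_t sum = vmull_u8(s0, taps[0]);
  sum = vmlal_u8(sum, s1, taps[1]);
  sum = vmlal_u8(sum, s2, taps[2]);
  sum = vmlal_u8(sum, s3, taps[3]);
  // Filter taps were halved, hence FILTER_BITS - 1 rather than FILTER_BITS.
  return vqrshrun_n_s16(vreinterpretq_s16_u16(sum), FILTER_BITS - 1);
}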
+
+static INLINE void convolve_8tap_2d_horiz_neon_dotprod(
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+ int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 3);
+
+    // Process final three rows (h % 4 == 3). See vpx_convolve8_neon_dotprod()
+    // below for further details on possible values of block height.
+ uint8x16_t s0, s1, s2;
+ load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+ int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 =
+ vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8_4x1(dst + 2 * dst_stride, d23);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 3);
+
+    // Process final three rows (h % 4 == 3). See vpx_convolve8_neon_dotprod()
+    // below for further details on possible values of block height.
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2;
+ load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+
+ store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ }
+}
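
Why the tail in this helper is always exactly three rows: the vertical pass that follows consumes im_height = h + SUBPEL_TAPS - 1 = h + 7 intermediate rows, and every block height h passed to these functions is a multiple of 4, so im_height = 4n + 7 = 4(n + 1) + 3, giving im_height % 4 == 3. The main loop therefore runs while h > 3 and the remaining three rows are handled separately.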
+
+void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
+ const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+ // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
+ // lines post both horizontally and vertically.
+ const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
+ const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;
+
+ if (x_filter_taps == 4 && y_filter_taps == 4) {
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ // 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const int8x8_t x_filter_4tap =
+ vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+ const uint8x8_t y_filter_4tap =
+ vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1);
+
+ convolve_4tap_2d_neon_dotprod(src - horiz_offset - vert_offset, src_stride,
+ dst, dst_stride, w, h, x_filter_4tap,
+ y_filter_4tap);
+ return;
+ }
+
+  // Given our constraints: w <= 64, h <= 64, taps <= 8, we can reduce the
+ // maximum buffer size to 64 * (64 + 7).
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+ const int im_stride = 64;
+ const int im_height = h + SUBPEL_TAPS - 1;
+
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+ convolve_8tap_2d_horiz_neon_dotprod(src - horiz_offset - vert_offset,
+ src_stride, im_block, im_stride, w,
+ im_height, x_filter_8tap);
+
+ convolve_8tap_vert_neon_dotprod(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter_8tap);
+}
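
The tap-halving trick on the 4-tap path is exact, not an approximation. With FILTER_BITS == 7 the full-precision rounding is (sum + 64) >> 7; because all 4-tap and bilinear tap values are even, sum is even, and (sum + 64) >> 7 == (sum / 2 + 32) >> 6, which is why the narrowing shifts above use FILTER_BITS - 1. A scalar sanity check of the identity (illustrative only, assuming arithmetic right shift):

#include <assert.h>
#include <stdint.h>

static void check_halved_rounding(int32_t sum) {
  assert((sum & 1) == 0);  // even taps imply an even sum
  assert(((sum + 64) >> 7) == ((sum / 2 + 32) >> 6));
}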
+
+void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+ const int im_stride = 64;
+
+ // Averaging convolution always uses an 8-tap filter.
+ // Account for the vertical phase needing 3 lines prior and 4 lines post.
+ const int im_height = h + SUBPEL_TAPS - 1;
+ const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1;
+
+ assert(y_step_q4 == 16);
+ assert(x_step_q4 == 16);
+
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+ convolve_8tap_2d_horiz_neon_dotprod(src - offset - offset * src_stride,
+ src_stride, im_block, im_stride, w,
+ im_height, x_filter_8tap);
+
+ vpx_convolve8_avg_vert_neon_dotprod(im_block + offset * im_stride, im_stride,
+ dst, dst_stride, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h);
+}
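
In both _avg paths the blend with the destination is vrhadd_u8, Neon's rounding halving add. Per lane it computes d = (p + q + 1) >> 1, i.e. the average rounded up, matching the scalar ROUND_POWER_OF_TWO(p + q, 1) convention used by the C convolve_avg code. A scalar model (illustrative):

static INLINE uint8_t rounded_avg_u8(uint8_t p, uint8_t q) {
  return (uint8_t)(((uint16_t)p + q + 1) >> 1);
}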
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
index bcad1dd121..e582004133 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
@@ -26,255 +26,112 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};
-DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
- 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
- 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
-};
-
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
- /* Shift left and insert new last column in transposed 4x4 block. */
+ // Shift left and insert new last column in transposed 4x4 block.
1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
- /* Shift left and insert two new columns in transposed 4x4 block. */
+ // Shift left and insert two new columns in transposed 4x4 block.
2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
- /* Shift left and insert three new columns in transposed 4x4 block. */
+ // Shift left and insert three new columns in transposed 4x4 block.
3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};
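
These rows are vqtbl2q index vectors: indices 0-15 select bytes from the first vector of a pair and 16-31 from the second. An illustrative trace of the first row acting on the transposed blocks used below:

// samples_LUT.val[0] = s3456  = { s3[0],s4[0],s5[0],s6[0],  s3[1],s4[1],... }
// samples_LUT.val[1] = s78910 = { s7[0],s8[0],s9[0],s10[0], s7[1],s8[1],... }
// Row { 1,2,3,16, 5,6,7,20, 9,10,11,24, 13,14,15,28 } therefore yields
//   s4567 = { s4[0],s5[0],s6[0],s7[0], s4[1],s5[1],s6[1],s7[1], ... }
// Each transposed 4-sample column shifts up one row and gains the newly
// loaded sample, so previously transposed data never has to be reloaded.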
-static INLINE void vpx_convolve_4tap_2d_horiz_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
- uint8x16_t s0, s1, s2, s3;
+static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16_t permute_tbl) {
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
- if (w == 4) {
- const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
+ int32x4_t sum =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0);
- do {
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve4_4_usdot(s0, filter, perm_tbl);
- d1 = convolve4_4_usdot(s1, filter, perm_tbl);
- d2 = convolve4_4_usdot(s2, filter, perm_tbl);
- d3 = convolve4_4_usdot(s3, filter, perm_tbl);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- load_u8_16x3(src, src_stride, &s0, &s1, &s2);
-
- d0 = convolve4_4_usdot(s0, filter, perm_tbl);
- d1 = convolve4_4_usdot(s1, filter, perm_tbl);
- d2 = convolve4_4_usdot(s2, filter, perm_tbl);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8_4x1(dst + 2 * dst_stride, d23);
- } else {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve4_8_usdot(s0, filter, perm_tbl);
- d1 = convolve4_8_usdot(s1, filter, perm_tbl);
- d2 = convolve4_8_usdot(s2, filter, perm_tbl);
- d3 = convolve4_8_usdot(s3, filter, perm_tbl);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x3(s, src_stride, &s0, &s1, &s2);
-
- d0 = convolve4_8_usdot(s0, filter, perm_tbl);
- d1 = convolve4_8_usdot(s1, filter, perm_tbl);
- d2 = convolve4_8_usdot(s2, filter, perm_tbl);
-
- store_u8_8x3(d, dst_stride, d0, d1, d2);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- }
+ // Further narrowing and packing is performed by the caller.
+ return vmovn_s32(sum);
}
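
vusdotq_lane_s32 maps to the Armv8.6-A USDOT instruction: each 32-bit lane accumulates a four-way dot product of unsigned 8-bit samples against signed 8-bit filter taps, with the lane index selecting which group of four taps. A scalar model of the lane-0 form used above (names illustrative, not libvpx API):

static void usdot_lane0_model(int32_t acc[4], const uint8_t x[16],
                              const int8_t f[8]) {
  // Per 32-bit lane i: accumulate bytes 4i..4i+3 of x against taps f[0..3].
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      acc[i] += (int32_t)x[4 * i + j] * (int32_t)f[j];
    }
  }
}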
-static INLINE void vpx_convolve_8tap_2d_horiz_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
- uint8x16_t s0, s1, s2, s3;
-
- if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- do {
- load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_4_usdot(s0, filter, perm_tbl);
- d1 = convolve8_4_usdot(s1, filter, perm_tbl);
- d2 = convolve8_4_usdot(s2, filter, perm_tbl);
- d3 = convolve8_4_usdot(s3, filter, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- load_u8_16x3(src, src_stride, &s0, &s1, &s2);
-
- d0 = convolve8_4_usdot(s0, filter, perm_tbl);
- d1 = convolve8_4_usdot(s1, filter, perm_tbl);
- d2 = convolve8_4_usdot(s2, filter, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8_4x1(dst + 2 * dst_stride, d23);
- } else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
-
- do {
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
-
- d0 = convolve8_8_usdot(s0, filter, perm_tbl);
- d1 = convolve8_8_usdot(s1, filter, perm_tbl);
- d2 = convolve8_8_usdot(s2, filter, perm_tbl);
- d3 = convolve8_8_usdot(s3, filter, perm_tbl);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h > 3);
-
- /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
- * further details on possible values of block height. */
- width = w;
- s = src;
- d = dst;
- do {
- load_u8_16x3(s, src_stride, &s0, &s1, &s2);
-
- d0 = convolve8_8_usdot(s0, filter, perm_tbl);
- d1 = convolve8_8_usdot(s1, filter, perm_tbl);
- d2 = convolve8_8_usdot(s2, filter, perm_tbl);
-
- store_u8_8x3(d, dst_stride, d0, d1, d2);
-
- s += 8;
- d += 8;
- width -= 8;
- } while (width > 0);
- }
+static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl) {
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
+ vqtbl1q_u8(samples, permute_tbl.val[1]) };
+
+ // First 4 output values.
+ int32x4_t sum0 =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ // Second 4 output values.
+ int32x4_t sum1 =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+ // We halved the filter values so -1 from right shift.
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
-void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4,
- int w, int h) {
- const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
-
- assert((intptr_t)dst % 4 == 0);
- assert(dst_stride % 4 == 0);
- assert(x_step_q4 == 16);
-
- (void)x_step_q4;
- (void)y0_q4;
- (void)y_step_q4;
-
- if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
- const int8x8_t x_filter_4tap =
- vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
- vpx_convolve_4tap_2d_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride,
- w, h, x_filter_4tap);
-
- } else {
- vpx_convolve_8tap_2d_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride,
- w, h, x_filter_8tap);
- }
+static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl) {
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
+ vqtbl1q_u8(samples, permute_tbl.val[1]) };
+
+ int32x4_t sum =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+
+ // Further narrowing and packing is performed by the caller.
+ return vshrn_n_s32(sum, 1);
}
-static INLINE void vpx_convolve_4tap_horiz_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
- uint8x16_t s0, s1, s2, s3;
+static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x3_t permute_tbl) {
+ // Permute samples ready for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ uint8x16_t permuted_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
+ vqtbl1q_u8(samples, permute_tbl.val[1]),
+ vqtbl1q_u8(samples, permute_tbl.val[2]) };
+
+ // First 4 output values.
+ int32x4_t sum0 =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+ // Second 4 output values.
+ int32x4_t sum1 =
+ vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
+ sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
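
The permute rows turn a single 16-byte load into the overlapping four-sample windows each dot-product lane needs. An illustrative demo (not part of the patch) applying the first row to a byte ramp:

static INLINE uint8x16_t permute_demo(void) {
  const uint8_t ramp[16] = { 0, 1, 2,  3,  4,  5,  6,  7,
                             8, 9, 10, 11, 12, 13, 14, 15 };
  uint8x16_t samples = vld1q_u8(ramp);
  uint8x16_t windows = vqtbl1q_u8(samples, vld1q_u8(dot_prod_permute_tbl));
  // windows == { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 }: the inputs for four
  // adjacent output pixels, computed from one load.
  return windows;
}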
+static INLINE void convolve_4tap_horiz_neon_i8mm(const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride, int w,
+ int h, const int8x8_t filter) {
if (w == 4) {
- const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23;
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve4_4_usdot(s0, filter, perm_tbl);
- t1 = convolve4_4_usdot(s1, filter, perm_tbl);
- t2 = convolve4_4_usdot(s2, filter, perm_tbl);
- t3 = convolve4_4_usdot(s3, filter, perm_tbl);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+ int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
+ int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
+ int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
+ int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
+ // We halved the filter values so -1 from right shift.
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -284,23 +141,21 @@ static INLINE void vpx_convolve_4tap_horiz_neon_i8mm(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve4_8_usdot(s0, filter, perm_tbl);
- d1 = convolve4_8_usdot(s1, filter, perm_tbl);
- d2 = convolve4_8_usdot(s2, filter, perm_tbl);
- d3 = convolve4_8_usdot(s3, filter, perm_tbl);
+ uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
@@ -315,25 +170,24 @@ static INLINE void vpx_convolve_4tap_horiz_neon_i8mm(
}
}
-static INLINE void vpx_convolve_8tap_horiz_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
- uint8x16_t s0, s1, s2, s3;
-
+static INLINE void convolve_8tap_horiz_neon_i8mm(const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride, int w,
+ int h, const int8x8_t filter) {
if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve8_4_usdot(s0, filter, perm_tbl);
- t1 = convolve8_4_usdot(s1, filter, perm_tbl);
- t2 = convolve8_4_usdot(s2, filter, perm_tbl);
- t3 = convolve8_4_usdot(s3, filter, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+ int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl);
+ int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
@@ -343,23 +197,21 @@ static INLINE void vpx_convolve_8tap_horiz_neon_i8mm(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3;
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_usdot(s0, filter, perm_tbl);
- d1 = convolve8_8_usdot(s1, filter, perm_tbl);
- d2 = convolve8_8_usdot(s2, filter, perm_tbl);
- d3 = convolve8_8_usdot(s3, filter, perm_tbl);
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
@@ -379,8 +231,6 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
-
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
assert(x_step_q4 == 16);
@@ -390,18 +240,21 @@ void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
(void)y_step_q4;
if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
+ // Load 4-tap filter into first 4 elements of the vector.
+ // All 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
const int8x8_t x_filter_4tap =
- vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
- vpx_convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w,
- h, x_filter_4tap);
+ vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+
+ convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w, h,
+ x_filter_4tap);
} else {
- vpx_convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w,
- h, x_filter_8tap);
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+ convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w, h,
+ x_filter_8tap);
}
}
@@ -411,7 +264,6 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
- uint8x16_t s0, s1, s2, s3;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
@@ -424,22 +276,21 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
src -= 3;
if (w == 4) {
- const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
- do {
- int16x4_t t0, t1, t2, t3;
- uint8x8_t d01, d23, dd01, dd23;
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
- t0 = convolve8_4_usdot(s0, filters, perm_tbl);
- t1 = convolve8_4_usdot(s1, filters, perm_tbl);
- t2 = convolve8_4_usdot(s2, filters, perm_tbl);
- t3 = convolve8_4_usdot(s3, filters, perm_tbl);
- d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+ int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl);
+ int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl);
+ int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl);
+ int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
- dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
- dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
@@ -452,24 +303,23 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
h -= 4;
} while (h != 0);
} else {
- const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
- const uint8_t *s;
- uint8_t *d;
- int width;
- uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
do {
- width = w;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
do {
+ uint8x16_t s0, s1, s2, s3;
load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_usdot(s0, filters, perm_tbl);
- d1 = convolve8_8_usdot(s1, filters, perm_tbl);
- d2 = convolve8_8_usdot(s2, filters, perm_tbl);
- d3 = convolve8_8_usdot(s3, filters, perm_tbl);
+ uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl);
+ uint8x8_t dd0, dd1, dd2, dd3;
load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
d0 = vrhadd_u8(d0, dd0);
@@ -492,216 +342,130 @@ void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
uint8x8_t a2, uint8x8_t a3,
- uint8x16_t *b,
- const uint8x16_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, XX, XX, XX, XX
- * a1: 10, 11, 12, 13, XX, XX, XX, XX
- * a2: 20, 21, 22, 23, XX, XX, XX, XX
- * a3: 30, 31, 32, 33, XX, XX, XX, XX
- *
- * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
- *b = vqtbl2q_u8(samples, permute_tbl);
+ uint8x16_t *b) {
+ // Transpose 8-bit elements and concatenate result rows as follows:
+ // a0: 00, 01, 02, 03, XX, XX, XX, XX
+ // a1: 10, 11, 12, 13, XX, XX, XX, XX
+ // a2: 20, 21, 22, 23, XX, XX, XX, XX
+ // a3: 30, 31, 32, 33, XX, XX, XX, XX
+ //
+ // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+
+ uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
+ uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
+ uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
+ uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
+
+ uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
+ uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];
+
+ uint16x8_t a0123 =
+ vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0];
+
+ *b = vreinterpretq_u8_u16(a0123);
}
static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
uint8x8_t a2, uint8x8_t a3,
- uint8x16_t *b0, uint8x16_t *b1,
- const uint8x16x2_t permute_tbl) {
- /* Transpose 8-bit elements and concatenate result rows as follows:
- * a0: 00, 01, 02, 03, 04, 05, 06, 07
- * a1: 10, 11, 12, 13, 14, 15, 16, 17
- * a2: 20, 21, 22, 23, 24, 25, 26, 27
- * a3: 30, 31, 32, 33, 34, 35, 36, 37
- *
- * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
- * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
- *
- * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
- * as an argument is preferable to loading it directly from memory as this
- * inline helper is called many times from the same parent function.
- */
-
- uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
- *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
- *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
+ uint8x16_t *b0, uint8x16_t *b1) {
+ // Transpose 8-bit elements and concatenate result rows as follows:
+ // a0: 00, 01, 02, 03, 04, 05, 06, 07
+ // a1: 10, 11, 12, 13, 14, 15, 16, 17
+ // a2: 20, 21, 22, 23, 24, 25, 26, 27
+ // a3: 30, 31, 32, 33, 34, 35, 36, 37
+ //
+ // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+
+ uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
+ uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
+ uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
+ uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
+
+ uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0];
+ uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0];
+
+ uint16x8x2_t a0123 =
+ vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23));
+
+ *b0 = vreinterpretq_u8_u16(a0123.val[0]);
+ *b1 = vreinterpretq_u8_u16(a0123.val[1]);
}
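
The zip-based rewrite removes the old dot_prod_tran_concat_tbl lookup entirely: two byte-level zips and one 16-bit zip perform the same transpose in registers. A step-by-step trace (illustrative):

// a01 = { 00,10, 01,11, 02,12, 03,13, 04,14, 05,15, 06,16, 07,17 }
// a23 = { 20,30, 21,31, 22,32, 23,33, 24,34, 25,35, 26,36, 27,37 }
// Zipping these as 16-bit lanes interleaves whole column pairs:
// a0123.val[0] = { 00,10,20,30, 01,11,21,31, 02,12,22,32, 03,13,23,33 }
// a0123.val[1] = { 04,14,24,34, 05,15,25,35, 06,16,26,36, 07,17,27,37 }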
-static INLINE void vpx_convolve_4tap_vert_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
- const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x16x2_t samples_LUT;
-
- if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- uint8x16_t s0123, s1234, s2345, s3456, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
- load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- src += 7 * src_stride;
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
-
- do {
- load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
-
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
-
- d0 = convolve4_4_usdot_partial(s0123, filter);
- d1 = convolve4_4_usdot_partial(s1234, filter);
- d2 = convolve4_4_usdot_partial(s2345, filter);
- d3 = convolve4_4_usdot_partial(s3456, filter);
- /* We halved the filter values so -1 from right shift. */
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
-
- store_u8(dst + 0 * dst_stride, dst_stride, d01);
- store_u8(dst + 2 * dst_stride, dst_stride, d23);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s0123 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s1234 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s2345 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
- s3456 = s78910;
-
- src += 4 * src_stride;
- dst += 4 * dst_stride;
- h -= 4;
- } while (h != 0);
- } else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
- do {
- height = h;
- s = src;
- d = dst;
+static INLINE int16x4_t convolve8_4_v(const uint8x16_t samples_lo,
+ const uint8x16_t samples_hi,
+ const int8x8_t filters) {
+ // Sample permutation is performed by the caller.
+ int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
+ sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
- load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
- s += 7 * src_stride;
-
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
-
- do {
- load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
-
- d0 = convolve4_8_usdot_partial(s0123_lo, s0123_hi, filter);
- d1 = convolve4_8_usdot_partial(s1234_lo, s1234_hi, filter);
- d2 = convolve4_8_usdot_partial(s2345_lo, s2345_hi, filter);
- d3 = convolve4_8_usdot_partial(s3456_lo, s3456_hi, filter);
-
- store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s0123_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s1234_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s2345_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
- s3456_lo = s78910_lo;
-
- samples_LUT.val[0] = s3456_hi;
- samples_LUT.val[1] = s78910_hi;
- s0123_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s1234_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s2345_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
- s3456_hi = s78910_hi;
+ // Further narrowing and packing is performed by the caller.
+ return vshrn_n_s32(sum, 1);
+}
- s += 4 * src_stride;
- d += 4 * dst_stride;
- height -= 4;
- } while (height != 0);
- src += 8;
- dst += 8;
- w -= 8;
- } while (w != 0);
- }
+static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo,
+ const uint8x16_t samples0_hi,
+ const uint8x16_t samples1_lo,
+ const uint8x16_t samples1_hi,
+ const int8x8_t filters) {
+ // Sample permutation is performed by the caller.
+
+ // First 4 output values.
+ int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
+ sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
+ // Second 4 output values.
+ int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
+ sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+ // Narrow and re-pack.
+ int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1);
}
-static INLINE void vpx_convolve_8tap_vert_neon_i8mm(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+static INLINE void convolve_8tap_vert_neon_i8mm(const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride, int w,
+ int h, const int8x8_t filter) {
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x16x2_t samples_LUT;
-
if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23;
-
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6;
load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
src += 7 * src_stride;
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ uint8x16_t s0123, s1234, s2345, s3456;
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456);
do {
+ uint8x8_t s7, s8, s9, s10;
load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+ uint8x16_t s78910;
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ uint8x16x2_t samples_LUT = { { s3456, s78910 } };
+ uint8x16_t s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_usdot_partial(s0123, s4567, filter);
- d1 = convolve8_4_usdot_partial(s1234, s5678, filter);
- d2 = convolve8_4_usdot_partial(s2345, s6789, filter);
- d3 = convolve8_4_usdot_partial(s3456, s78910, filter);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
+ int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
+ int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
+ int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123 = s4567;
s1234 = s5678;
s2345 = s6789;
@@ -712,67 +476,56 @@ static INLINE void vpx_convolve_8tap_vert_neon_i8mm(
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
- s6789_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
do {
- height = h;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6;
load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
s += 7 * src_stride;
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi;
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
+ uint8x8_t s7, s8, s9, s10;
load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
+ uint8x16_t s78910_lo, s78910_hi;
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+ uint8x16_t s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
samples_LUT.val[0] = s3456_hi;
samples_LUT.val[1] = s78910_hi;
- s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- filter);
- d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- filter);
- d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- filter);
- d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- filter);
+ uint8x16_t s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ uint8x8_t d0 =
+ convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
+ uint8x8_t d1 =
+ convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
+ uint8x8_t d2 =
+ convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
+ uint8x8_t d3 =
+ convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);
store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123_lo = s4567_lo;
s0123_hi = s4567_hi;
s1234_lo = s5678_lo;
@@ -798,8 +551,6 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
-
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
assert(y_step_q4 == 16);
@@ -809,17 +560,15 @@ void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
(void)y_step_q4;
if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
- /* All 4-tap and bilinear filter values are even, so halve them to reduce
- * intermediate precision requirements. Also slide the filter values so the
- * the 4 taps exist in the first 4 elements of the vector.
- */
- const int8x8_t y_filter_4tap =
- vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2);
- vpx_convolve_4tap_vert_neon_i8mm(src - src_stride, src_stride, dst,
- dst_stride, w, h, y_filter_4tap);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
+ y_filter);
} else {
- vpx_convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst,
- dst_stride, w, h, y_filter_8tap);
+ const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+ convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst,
+ dst_stride, w, h, y_filter);
}
}
@@ -830,8 +579,6 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
int w, int h) {
const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
- uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x16x2_t samples_LUT;
assert((intptr_t)dst % 4 == 0);
assert(dst_stride % 4 == 0);
@@ -844,43 +591,40 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
src -= 3 * src_stride;
if (w == 4) {
- const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
- uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
- int16x4_t d0, d1, d2, d3;
- uint8x8_t d01, d23, dd01, dd23;
-
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6;
load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
src += 7 * src_stride;
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ uint8x16_t s0123, s1234, s2345, s3456;
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456);
do {
+ uint8x8_t s7, s8, s9, s10;
load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
- transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+ uint8x16_t s78910;
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456;
- samples_LUT.val[1] = s78910;
- s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ uint8x16x2_t samples_LUT = { { s3456, s78910 } };
+ uint8x16_t s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_usdot_partial(s0123, s4567, filters);
- d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
- d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
- d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
- d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
- d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ int16x4_t d0 = convolve8_4_v(s0123, s4567, filters);
+ int16x4_t d1 = convolve8_4_v(s1234, s5678, filters);
+ int16x4_t d2 = convolve8_4_v(s2345, s6789, filters);
+ int16x4_t d3 = convolve8_4_v(s3456, s78910, filters);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
- dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
- dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+ uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
@@ -888,8 +632,8 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
store_u8(dst + 0 * dst_stride, dst_stride, d01);
store_u8(dst + 2 * dst_stride, dst_stride, d23);
- /* Prepare block for next iteration - re-using as much as possible. */
- /* Shuffle everything up four rows. */
+ // Prepare block for next iteration - re-using as much as possible.
+ // Shuffle everything up four rows.
s0123 = s4567;
s1234 = s5678;
s2345 = s6789;
@@ -900,63 +644,53 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
h -= 4;
} while (h != 0);
} else {
- const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
- uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
- s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
- s6789_hi, s78910_lo, s78910_hi;
- uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
- const uint8_t *s;
- uint8_t *d;
- int height;
-
do {
- height = h;
- s = src;
- d = dst;
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6;
load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
s += 7 * src_stride;
- /* This operation combines a conventional transpose and the sample permute
- * (see horizontal case) required before computing the dot product.
- */
- transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
- tran_concat_tbl);
- transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
- tran_concat_tbl);
+ // This operation combines a conventional transpose and the sample permute
+ // (see horizontal case) required before computing the dot product.
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi;
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
do {
+ uint8x8_t s7, s8, s9, s10;
load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
- transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
- tran_concat_tbl);
+ uint8x16_t s78910_lo, s78910_hi;
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
- /* Merge new data into block from previous iteration. */
- samples_LUT.val[0] = s3456_lo;
- samples_LUT.val[1] = s78910_lo;
- s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+ // Merge new data into block from previous iteration.
+ uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+ uint8x16_t s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
samples_LUT.val[0] = s3456_hi;
samples_LUT.val[1] = s78910_hi;
- s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
- s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
- s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
-
- d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- filters);
- d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- filters);
- d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- filters);
- d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- filters);
-
+ uint8x16_t s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ uint8x16_t s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ uint8x16_t s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ uint8x8_t d0 =
+ convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters);
+ uint8x8_t d1 =
+ convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters);
+ uint8x8_t d2 =
+ convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters);
+ uint8x8_t d3 =
+ convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters);
+
+ uint8x8_t dd0, dd1, dd2, dd3;
load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
d0 = vrhadd_u8(d0, dd0);
@@ -987,3 +721,275 @@ void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
} while (w != 0);
}
}
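
The samples_LUT / merge_block_tbl pattern above implements a rolling four-row window: instead of re-transposing seven rows on every iteration, the overlapping vectors s4567, s5678 and s6789 are rebuilt by table lookup from the previous block (s3456) and the freshly transposed rows (s78910). A scalar model of the idea, assuming each four-byte group of the transposed vectors holds four consecutive rows of one output column (our sketch, not library code):

static void merge_rows_model(const uint8_t s3456[16], const uint8_t s78910[16],
                             uint8_t s4567[16]) {
  for (int c = 0; c < 4; ++c) {        /* four output columns */
    for (int i = 0; i < 3; ++i) {      /* keep rows 4..6 from the old block */
      s4567[4 * c + i] = s3456[4 * c + i + 1];
    }
    s4567[4 * c + 3] = s78910[4 * c];  /* pull row 7 from the new block */
  }
}

The other two lookups (s5678, s6789) follow the same scheme with the split point moved one and two rows further along.
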
+
+static INLINE void convolve_4tap_2d_neon_i8mm(const uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride, int w,
+ int h, const int8x8_t x_filter,
+ const uint8x8_t y_filter) {
+ // Neon does not have lane-referencing multiply or multiply-accumulate
+ // instructions that operate on vectors of 8-bit elements. This means we have
+ // to duplicate filter taps into a whole vector and use standard multiply /
+ // multiply-accumulate instructions.
+ const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2),
+ vdup_lane_u8(y_filter, 3),
+ vdup_lane_u8(y_filter, 4),
+ vdup_lane_u8(y_filter, 5) };
+
+ if (w == 4) {
+ const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+
+ uint8x16_t h_s0, h_s1, h_s2;
+ load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
+
+ int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl);
+ int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl);
+ int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl);
+ // We halved the filter values, so subtract 1 from the right shift.
+ uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+ uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1);
+
+ src += 3 * src_stride;
+
+ do {
+ uint8x16_t h_s3, h_s4, h_s5, h_s6;
+ load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+ int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl);
+ int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl);
+ int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl);
+ int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl);
+ // We halved the filter values, so subtract 1 from the right shift.
+ uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1);
+ uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1);
+ uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4);
+ uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4);
+
+ uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps);
+ uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps);
+
+ store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ v_s01 = v_s45;
+ v_s12 = v_s56;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int height = h;
+
+ uint8x16_t h_s0, h_s1, h_s2;
+ load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2);
+
+ uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl);
+ uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl);
+ uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl);
+
+ s += 3 * src_stride;
+
+ do {
+ uint8x16_t h_s3, h_s4, h_s5, h_s6;
+ load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+ uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl);
+ uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl);
+ uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl);
+ uint8x8_t v_s6 = convolve4_8_h(h_s6, x_filter, permute_tbl);
+
+ uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps);
+ uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps);
+ uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps);
+ uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ v_s0 = v_s4;
+ v_s1 = v_s5;
+ v_s2 = v_s6;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
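
convolve4_8 is defined earlier in this file and not shown in this hunk; as a rough illustration of the broadcast-tap approach the comment above describes, a widening multiply-accumulate sketch could look like the following. This is our sketch under two assumptions: the taps were pre-halved, as in the caller above, and are non-negative, matching the vabsq_s16 applied before halving; FILTER_BITS is 7 in vpx_dsp/vpx_filter.h.

static INLINE uint8x8_t convolve4_8_sketch(uint8x8_t s0, uint8x8_t s1,
                                           uint8x8_t s2, uint8x8_t s3,
                                           const uint8x8_t taps[4]) {
  /* Widening multiply-accumulate: 8-bit samples * 8-bit taps -> 16-bit sums. */
  uint16x8_t sum = vmull_u8(s0, taps[0]);
  sum = vmlal_u8(sum, s1, taps[1]);
  sum = vmlal_u8(sum, s2, taps[2]);
  sum = vmlal_u8(sum, s3, taps[3]);
  /* Narrow with rounding; the taps were halved, hence the reduced shift. */
  return vqrshrn_n_u16(sum, FILTER_BITS - 1);
}
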
+
+static INLINE void convolve_8tap_2d_horiz_neon_i8mm(
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+ int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 3);
+
+ // Process the final three rows (h % 4 == 3). See vpx_convolve8_neon_i8mm()
+ // below for further details on possible values of block height.
+ uint8x16_t s0, s1, s2;
+ load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+ int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+ int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+ int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+ uint8x8_t d23 =
+ vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8_4x1(dst + 2 * dst_stride, d23);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+ do {
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2, s3;
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+ uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 3);
+
+ // Process the final three rows (h % 4 == 3). See vpx_convolve8_neon_i8mm()
+ // below for further details on possible values of block height.
+ const uint8_t *s = src;
+ uint8_t *d = dst;
+ int width = w;
+
+ do {
+ uint8x16_t s0, s1, s2;
+ load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+ uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+ uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+ uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+
+ store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ }
+}
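
The convolve8_4_h / convolve8_8_h helpers referenced here live earlier in the file. As an informal sketch of the i8mm dot-product idea for the four-pixel case, assuming dot_prod_permute_tbl arranges the 16 input bytes into overlapping four-sample groups (our illustration; the library helper may distribute the shifts differently):

static INLINE int16x4_t convolve8_4_h_sketch(uint8x16_t samples,
                                             int8x8_t filter,
                                             uint8x16x2_t permute_tbl) {
  /* Gather samples 0..3 and 4..7 of each output pixel's 8-sample window. */
  uint8x16_t perm0 = vqtbl1q_u8(samples, permute_tbl.val[0]);
  uint8x16_t perm1 = vqtbl1q_u8(samples, permute_tbl.val[1]);
  /* USDOT: unsigned samples x signed taps, accumulated per 4-byte group. */
  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), perm0, filter, 0);
  sum = vusdotq_lane_s32(sum, perm1, filter, 1);
  /* Narrow by one bit so the callers' FILTER_BITS - 1 rounding shift
   * completes the FILTER_BITS total. */
  return vshrn_n_s32(sum, 1);
}
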
+
+void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ assert(x_step_q4 == 16);
+ assert(y_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
+ const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+ // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
+ // lines post, both horizontally and vertically.
+ const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
+ const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;
+
+ if (x_filter_taps == 4 && y_filter_taps == 4) {
+ const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+ const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+ // 4-tap and bilinear filter values are even, so halve them to reduce
+ // intermediate precision requirements.
+ const int8x8_t x_filter_4tap =
+ vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+ const uint8x8_t y_filter_4tap =
+ vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1);
+
+ convolve_4tap_2d_neon_i8mm(src - horiz_offset - vert_offset, src_stride,
+ dst, dst_stride, w, h, x_filter_4tap,
+ y_filter_4tap);
+ return;
+ }
+
+ // Given our constraints (w <= 64, h <= 64, taps <= 8), we can reduce the
+ // maximum buffer size to 64 * (64 + 7).
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+ const int im_stride = 64;
+ const int im_height = h + SUBPEL_TAPS - 1;
+
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+ convolve_8tap_2d_horiz_neon_i8mm(src - horiz_offset - vert_offset, src_stride,
+ im_block, im_stride, w, im_height,
+ x_filter_8tap);
+
+ convolve_8tap_vert_neon_i8mm(im_block, im_stride, dst, dst_stride, w, h,
+ y_filter_8tap);
+}
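
A quick sanity check on the sizing above (our arithmetic): w and h are multiples of 4 with w, h <= 64, so im_height = h + SUBPEL_TAPS - 1 is at most 64 + 7 = 71 rows, matching the 64 * 71 intermediate buffer, and im_height % 4 == 3, which is why convolve_8tap_2d_horiz_neon_i8mm() above ends with a dedicated three-row tail.
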
+
+void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+ const int im_stride = 64;
+
+ // Averaging convolution always uses an 8-tap filter.
+ // Account for the vertical phase needing 3 lines prior and 4 lines post.
+ const int im_height = h + SUBPEL_TAPS - 1;
+ const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1;
+
+ assert(y_step_q4 == 16);
+ assert(x_step_q4 == 16);
+
+ const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+ convolve_8tap_2d_horiz_neon_i8mm(src - offset - offset * src_stride,
+ src_stride, im_block, im_stride, w,
+ im_height, x_filter_8tap);
+
+ vpx_convolve8_avg_vert_neon_i8mm(im_block + offset * im_stride, im_stride,
+ dst, dst_stride, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
index 57772ea668..de5fa29471 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
@@ -19,31 +19,32 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const InterpKernel *filter,
int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
- * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
- */
- uint8_t temp[64 * 72];
+ // Given our constraints (w <= 64, h <= 64, taps <= 8), we can reduce the
+ // maximum buffer size to 64 * (64 + 7) (+1 row so the height is divisible by 4).
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 72]);
+ const int im_stride = 64;
const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
- /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
- * and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.) */
- const int intermediate_height = h + vert_filter_taps;
+ // Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
+ // and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.)
+ const int im_height = h + vert_filter_taps;
const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
assert(y_step_q4 == 16);
assert(x_step_q4 == 16);
- /* Filter starting border_offset lines back. The Neon implementation will
- * ignore the given height and filter a multiple of 4 lines. Since this goes
- * in to the temp buffer which has lots of extra room and is subsequently
- * discarded this is safe if somewhat less than ideal. */
- vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride, temp,
- w, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- intermediate_height);
+ // Filter starting border_offset rows back. The Neon implementation will
+ // ignore the given height and filter a multiple of 4 lines. Since this goes
+ // into the temporary buffer, which has lots of extra room and is subsequently
+ // discarded, this is safe if somewhat less than ideal.
+ vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride,
+ im_block, im_stride, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, im_height);
- /* Step into the temp buffer border_offset lines to get actual frame data. */
- vpx_convolve8_vert_neon(temp + w * border_offset, w, dst, dst_stride, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+ // Step border_offset rows into the temporary buffer to reach the actual frame data.
+ vpx_convolve8_vert_neon(im_block + im_stride * border_offset, im_stride, dst,
+ dst_stride, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, h);
}
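
Putting numbers to the buffer comments above (our illustration): in the worst case h = 64 with an 8-tap vertical filter, im_height = 64 + 8 = 72 rows, one more than the 71 strictly needed, so the passes always handle whole groups of four rows; at im_stride = 64 that is exactly the 64 * 72 bytes declared.
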
void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
@@ -51,18 +52,21 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
- uint8_t temp[64 * 72];
- const int intermediate_height = h + 8;
+ DECLARE_ALIGNED(32, uint8_t, im_block[64 * 72]);
+ const int im_stride = 64;
+ const int im_height = h + SUBPEL_TAPS;
+ const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1;
assert(y_step_q4 == 16);
assert(x_step_q4 == 16);
- /* This implementation has the same issues as above. In addition, we only want
- * to average the values after both passes.
- */
- vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- intermediate_height);
- vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
- x_step_q4, y0_q4, y_step_q4, w, h);
+ // This implementation has the same issues as above. In addition, we only want
+ // to average the values after both passes.
+ vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride,
+ im_block, im_stride, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, im_height);
+
+ vpx_convolve8_avg_vert_neon(im_block + im_stride * border_offset, im_stride,
+ dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,
+ y_step_q4, w, h);
}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c
deleted file mode 100644
index 9d754fde17..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_dotprod.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/arm/vpx_convolve8_neon.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/vpx_filter.h"
-#include "vpx_ports/mem.h"
-
-void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4, int w,
- int h) {
- /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
- * maximum buffer size to 64 * (64 + 7). */
- uint8_t temp[64 * 71];
-
- const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
- /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
- * and vert_filter_taps / 2 lines post. */
- const int intermediate_height = h + vert_filter_taps - 1;
- const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
-
- assert(y_step_q4 == 16);
- assert(x_step_q4 == 16);
-
- vpx_convolve8_2d_horiz_neon_dotprod(
- src - src_stride * border_offset, src_stride, temp, w, filter, x0_q4,
- x_step_q4, y0_q4, y_step_q4, w, intermediate_height);
-
- vpx_convolve8_vert_neon_dotprod(temp + w * border_offset, w, dst, dst_stride,
- filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- h);
-}
-
-void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4,
- int w, int h) {
- uint8_t temp[64 * 71];
-
- /* Averaging convolution always uses an 8-tap filter. */
- /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
- const int intermediate_height = h + 7;
-
- assert(y_step_q4 == 16);
- assert(x_step_q4 == 16);
-
- vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w,
- filter, x0_q4, x_step_q4, y0_q4,
- y_step_q4, w, intermediate_height);
-
- vpx_convolve8_avg_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
-}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c
deleted file mode 100644
index d7cbb09ea6..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon_i8mm.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/arm/vpx_convolve8_neon.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/vpx_filter.h"
-#include "vpx_ports/mem.h"
-
-void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4, int w,
- int h) {
- /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
- * maximum buffer size to 64 * (64 + 7). */
- uint8_t temp[64 * 71];
-
- const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
- /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
- * and vert_filter_taps / 2 lines post. */
- const int intermediate_height = h + vert_filter_taps - 1;
- const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
-
- assert(y_step_q4 == 16);
- assert(x_step_q4 == 16);
-
- vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * border_offset, src_stride,
- temp, w, filter, x0_q4, x_step_q4, y0_q4,
- y_step_q4, w, intermediate_height);
-
- vpx_convolve8_vert_neon_i8mm(temp + w * border_offset, w, dst, dst_stride,
- filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
- h);
-}
-
-void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *filter, int x0_q4,
- int x_step_q4, int y0_q4, int y_step_q4, int w,
- int h) {
- uint8_t temp[64 * 71];
-
- /* Averaging convolution always uses an 8-tap filter. */
- /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
- const int intermediate_height = h + 7;
-
- assert(y_step_q4 == 16);
- assert(x_step_q4 == 16);
-
- vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w,
- filter, x0_q4, x_step_q4, y0_q4, y_step_q4,
- w, intermediate_height);
-
- vpx_convolve8_avg_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter,
- x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
-}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h
new file mode 100644
index 0000000000..bf9f18c7e6
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_
+#define VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+
+// Some useful instructions are exclusive to the SVE2 instruction set.
+// However, we can access them from a predominantly Neon context by using the
+// Neon-SVE bridge intrinsics to reinterpret Neon vectors as SVE vectors, with
+// the high part of the SVE vector (if it's longer than 128 bits) being
+// "don't care".
+
+static INLINE int16x8_t vpx_tbl2_s16(int16x8_t s0, int16x8_t s1,
+ uint16x8_t tbl) {
+ svint16x2_t samples = svcreate2_s16(svset_neonq_s16(svundef_s16(), s0),
+ svset_neonq_s16(svundef_s16(), s1));
+ return svget_neonq_s16(
+ svtbl2_s16(samples, svset_neonq_u16(svundef_u16(), tbl)));
+}
+
+#endif // VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_
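
A hypothetical use of vpx_tbl2_s16 (our sketch, not code from this patch): interleave the low four 16-bit lanes of two Neon vectors. Note that svtbl2 indexes a table formed from two full SVE vectors, so the second vector starts at element svcnth(), which equals 8 only when the vector length is 128 bits; the sketch corrects the indices accordingly.

static INLINE int16x8_t interleave4_lo(int16x8_t s0, int16x8_t s1) {
  static const uint16_t kIdx[8] = { 0, 8, 1, 9, 2, 10, 3, 11 };
  uint16x8_t idx = vld1q_u16(kIdx);
  /* Rebase indices >= 8 onto the true start of the second vector. */
  uint16x8_t fixup = vdupq_n_u16((uint16_t)(svcnth() - 8));
  idx = vaddq_u16(idx, vandq_u16(fixup, vcgeq_u16(idx, vdupq_n_u16(8))));
  return vpx_tbl2_s16(s0, s1, idx);  /* {a0,b0,a1,b1,a2,b2,a3,b3} */
}
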
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h
new file mode 100644
index 0000000000..48534fb70e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_
+#define VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+
+// Dot product instructions operating on 16-bit input elements are exclusive
+// to the SVE instruction set. However, we can access them from a predominantly
+// Neon context by using the Neon-SVE bridge intrinsics to reinterpret Neon
+// vectors as SVE vectors, with the high part of the SVE vector (if it's longer
+// than 128 bits) being "don't care".
+
+// While sub-optimal on machines whose SVE vector length exceeds 128 bits (the
+// remainder of the vector is unused), this approach is still beneficial
+// compared to a Neon-only solution.
+
+static INLINE uint64x2_t vpx_dotq_u16(uint64x2_t acc, uint16x8_t x,
+ uint16x8_t y) {
+ return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc),
+ svset_neonq_u16(svundef_u16(), x),
+ svset_neonq_u16(svundef_u16(), y)));
+}
+
+static INLINE int64x2_t vpx_dotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) {
+ return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc),
+ svset_neonq_s16(svundef_s16(), x),
+ svset_neonq_s16(svundef_s16(), y)));
+}
+
+#define vpx_dotq_lane_s16(acc, x, y, lane) \
+ svget_neonq_s64(svdot_lane_s64(svset_neonq_s64(svundef_s64(), acc), \
+ svset_neonq_s16(svundef_s16(), x), \
+ svset_neonq_s16(svundef_s16(), y), lane))
+
+static INLINE uint16x8_t vpx_tbl_u16(uint16x8_t data, uint16x8_t indices) {
+ return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), data),
+ svset_neonq_u16(svundef_u16(), indices)));
+}
+
+#endif // VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_
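
For illustration, the 16-bit dot product maps naturally onto sum-of-squares style reductions such as the vpx_sum_squares_2d_i16 SVE specialization added elsewhere in this patch; a minimal hypothetical use (our sketch):

static INLINE int64_t sum_squares_8(const int16_t *src) {
  int16x8_t s = vld1q_s16(src);
  /* Each 64-bit lane accumulates the dot product of four 16-bit pairs. */
  int64x2_t acc = vpx_dotq_s16(vdupq_n_s64(0), s, s);
  return vaddvq_s64(acc);  /* horizontal add of the two lanes */
}
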
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
index b8e3c5e540..9bd5ec285c 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
@@ -20,263 +20,271 @@
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_ports/mem.h"
-static INLINE void scaledconvolve_horiz_w4(
+static INLINE void scaledconvolve_horiz_neon(
const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
- const int x0_q4, const int x_step_q4, const int w, const int h) {
- DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
- int x, y, z;
+ const ptrdiff_t dst_stride, const InterpKernel *const x_filter,
+ const int x0_q4, const int x_step_q4, int w, int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
src -= SUBPEL_TAPS / 2 - 1;
- y = h;
- do {
- int x_q4 = x0_q4;
- x = 0;
+ if (w == 4) {
do {
- // process 4 src_x steps
- for (z = 0; z < 4; ++z) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ int x_q4 = x0_q4;
+
+ // Process a 4x4 tile.
+ for (int r = 0; r < 4; ++r) {
+ const uint8_t *s = &src[x_q4 >> SUBPEL_BITS];
+
if (x_q4 & SUBPEL_MASK) {
- const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
- uint8x8_t s[8], d;
- int16x8_t ss[4];
- int16x4_t t[8], tt;
-
- load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
- transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);
-
- ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
- ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
- ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
- ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
- t[0] = vget_low_s16(ss[0]);
- t[1] = vget_low_s16(ss[1]);
- t[2] = vget_low_s16(ss[2]);
- t[3] = vget_low_s16(ss[3]);
- t[4] = vget_high_s16(ss[0]);
- t[5] = vget_high_s16(ss[1]);
- t[6] = vget_high_s16(ss[2]);
- t[7] = vget_high_s16(ss[3]);
-
- tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
- filters);
- d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
- vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
+ const int16x8_t filter = vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]);
+
+ uint8x8_t t0, t1, t2, t3;
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ uint8x8_t d0 =
+ vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS);
+
+ store_u8_4x1(&temp[4 * r], d0);
} else {
- int i;
- for (i = 0; i < 4; ++i) {
- temp[z * 4 + i] = src_x[i * src_stride + 3];
+ // Copy directly for non-subpel locations.
+ s += SUBPEL_TAPS / 2 - 1;
+
+ for (int c = 0; c < 4; ++c) {
+ temp[r * 4 + c] = s[c * src_stride];
}
}
x_q4 += x_step_q4;
}
- // transpose the 4x4 filters values back to dst
- {
- const uint8x8x4_t d4 = vld4_u8(temp);
- vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
- vreinterpret_u32_u8(d4.val[0]), 0);
- vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
- vreinterpret_u32_u8(d4.val[1]), 0);
- vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
- vreinterpret_u32_u8(d4.val[2]), 0);
- vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
- vreinterpret_u32_u8(d4.val[3]), 0);
- }
- x += 4;
- } while (x < w);
+ // Transpose the 4x4 result tile and store.
+ uint8x8_t d01 = vld1_u8(temp + 0);
+ uint8x8_t d23 = vld1_u8(temp + 8);
- src += src_stride * 4;
- dst += dst_stride * 4;
- y -= 4;
- } while (y > 0);
-}
+ transpose_u8_4x4(&d01, &d23);
-static INLINE void scaledconvolve_horiz_w8(
- const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
- const int x0_q4, const int x_step_q4, const int w, const int h) {
- DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
- int x, y, z;
- src -= SUBPEL_TAPS / 2 - 1;
+ store_u8_4x1(dst + 0 * dst_stride, d01);
+ store_u8_4x1(dst + 1 * dst_stride, d23);
+ store_u8_4x1_high(dst + 2 * dst_stride, d01);
+ store_u8_4x1_high(dst + 3 * dst_stride, d23);
- // This function processes 8x8 areas. The intermediate height is not always
- // a multiple of 8, so force it to be a multiple of 8 here.
- y = (h + 7) & ~7;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ return;
+ }
do {
int x_q4 = x0_q4;
- x = 0;
+ uint8_t *d = dst;
+ int width = w;
+
do {
- uint8x8_t d[8];
- // process 8 src_x steps
- for (z = 0; z < 8; ++z) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ // Process an 8x8 tile.
+ for (int r = 0; r < 8; ++r) {
+ const uint8_t *s = &src[x_q4 >> SUBPEL_BITS];
if (x_q4 & SUBPEL_MASK) {
- const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
- uint8x8_t s[8];
- load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
- &s[5], &s[6], &s[7]);
- transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
- &s[7]);
- d[0] = scale_filter_8(s, filters);
- vst1_u8(&temp[8 * z], d[0]);
+ const int16x8_t filter = vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]);
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+
+ vst1_u8(&temp[r * 8], d0);
} else {
- int i;
- for (i = 0; i < 8; ++i) {
- temp[z * 8 + i] = src_x[i * src_stride + 3];
+ // Copy directly for non-subpel locations.
+ s += SUBPEL_TAPS / 2 - 1;
+
+ for (int c = 0; c < 8; ++c) {
+ temp[r * 8 + c] = s[c * src_stride];
}
}
x_q4 += x_step_q4;
}
- // transpose the 8x8 filters values back to dst
- load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
- &d[7]);
- transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
- vst1_u8(&dst[x + 0 * dst_stride], d[0]);
- vst1_u8(&dst[x + 1 * dst_stride], d[1]);
- vst1_u8(&dst[x + 2 * dst_stride], d[2]);
- vst1_u8(&dst[x + 3 * dst_stride], d[3]);
- vst1_u8(&dst[x + 4 * dst_stride], d[4]);
- vst1_u8(&dst[x + 5 * dst_stride], d[5]);
- vst1_u8(&dst[x + 6 * dst_stride], d[6]);
- vst1_u8(&dst[x + 7 * dst_stride], d[7]);
- x += 8;
- } while (x < w);
-
- src += src_stride * 8;
- dst += dst_stride * 8;
- } while (y -= 8);
-}
+ // Transpose the 8x8 result tile and store.
+ uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
+ load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-static INLINE void scaledconvolve_vert_w4(
- const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
- const int y0_q4, const int y_step_q4, const int w, const int h) {
- int y;
- int y_q4 = y0_q4;
+ transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- y = h;
- do {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
- if (y_q4 & SUBPEL_MASK) {
- const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- uint8x8_t s[8], d;
- int16x4_t t[8], tt;
-
- load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
- &s[6], &s[7]);
- t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
- t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
- t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
- t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
- t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
- t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
- t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
- t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
-
- tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
- d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
- } else {
- memcpy(dst, &src_y[3 * src_stride], w);
- }
+ d += 8;
+ width -= 8;
+ } while (width != 0);
- dst += dst_stride;
- y_q4 += y_step_q4;
- } while (--y);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
}
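
The x_q4 counter above is a Q4 fixed-point source position: x_q4 >> SUBPEL_BITS gives the integer column and x_q4 & SUBPEL_MASK the subpel phase (SUBPEL_BITS is 4 and SUBPEL_MASK is 15 in libvpx). A scalar model of the walk, with apply_8tap standing in as a hypothetical helper (our sketch):

int x_q4 = x0_q4;
for (int x = 0; x < w; ++x) {
  const uint8_t *s = &src[x_q4 >> SUBPEL_BITS];  /* integer source column */
  const int phase = x_q4 & SUBPEL_MASK;          /* one of 16 subpel filters */
  dst[x] = (phase != 0) ? apply_8tap(s, x_filter[phase]) /* hypothetical */
                        : s[SUBPEL_TAPS / 2 - 1];        /* center sample */
  x_q4 += x_step_q4;  /* 16 per output pixel at unity scale, 32 at x1/2 */
}
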
-static INLINE void scaledconvolve_vert_w8(
+static INLINE void scaledconvolve_vert_neon(
const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
- const int y0_q4, const int y_step_q4, const int w, const int h) {
- int y;
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filter,
+ const int y0_q4, const int y_step_q4, int w, int h) {
int y_q4 = y0_q4;
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- y = h;
- do {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- if (y_q4 & SUBPEL_MASK) {
- const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- uint8x8_t s[8], d;
- load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
- &s[6], &s[7]);
- d = scale_filter_8(s, filters);
- vst1_u8(dst, d);
- } else {
- memcpy(dst, &src_y[3 * src_stride], w);
- }
- dst += dst_stride;
- y_q4 += y_step_q4;
- } while (--y);
-}
+ if (w == 4) {
+ do {
+ const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-static INLINE void scaledconvolve_vert_w16(
- const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
- const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
- const int y0_q4, const int y_step_q4, const int w, const int h) {
- int x, y;
- int y_q4 = y0_q4;
+ if (y_q4 & SUBPEL_MASK) {
+ const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]);
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
+ int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+ int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
+ int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+
+ int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ uint8x8_t d0 =
+ vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS);
+
+ store_u8_4x1(dst, d0);
+ } else {
+ // Memcpy for non-subpel locations.
+ memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 4);
+ }
+
+ y_q4 += y_step_q4;
+ dst += dst_stride;
+ } while (--h != 0);
+ return;
+ }
+
+ if (w == 8) {
+ do {
+ const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+
+ if (y_q4 & SUBPEL_MASK) {
+ const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]);
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+
+ vst1_u8(dst, d0);
+ } else {
+ // Memcpy for non-subpel locations.
+ memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 8);
+ }
+
+ y_q4 += y_step_q4;
+ dst += dst_stride;
+ } while (--h != 0);
+ return;
+ }
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- y = h;
do {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ uint8_t *d = dst;
+ int width = w;
+
if (y_q4 & SUBPEL_MASK) {
- x = 0;
do {
- const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- uint8x16_t ss[8];
- uint8x8_t s[8], d[2];
- load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
- &ss[5], &ss[6], &ss[7]);
- s[0] = vget_low_u8(ss[0]);
- s[1] = vget_low_u8(ss[1]);
- s[2] = vget_low_u8(ss[2]);
- s[3] = vget_low_u8(ss[3]);
- s[4] = vget_low_u8(ss[4]);
- s[5] = vget_low_u8(ss[5]);
- s[6] = vget_low_u8(ss[6]);
- s[7] = vget_low_u8(ss[7]);
- d[0] = scale_filter_8(s, filters);
-
- s[0] = vget_high_u8(ss[0]);
- s[1] = vget_high_u8(ss[1]);
- s[2] = vget_high_u8(ss[2]);
- s[3] = vget_high_u8(ss[3]);
- s[4] = vget_high_u8(ss[4]);
- s[5] = vget_high_u8(ss[5]);
- s[6] = vget_high_u8(ss[6]);
- s[7] = vget_high_u8(ss[7]);
- d[1] = scale_filter_8(s, filters);
- vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
- src_y += 16;
- x += 16;
- } while (x < w);
+ const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]);
+
+ uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_16x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+ s0[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
+ s1[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1)));
+ s2[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2)));
+ s3[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3)));
+ s4[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t4)));
+ s5[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t5)));
+ s6[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t6)));
+ s7[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t7)));
+
+ s0[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
+ s1[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1)));
+ s2[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2)));
+ s3[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3)));
+ s4[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t4)));
+ s5[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t5)));
+ s6[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t6)));
+ s7[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t7)));
+
+ uint8x8_t d0 = convolve8_8(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0],
+ s6[0], s7[0], filter);
+ uint8x8_t d1 = convolve8_8(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1],
+ s6[1], s7[1], filter);
+
+ vst1q_u8(d, vcombine_u8(d0, d1));
+
+ s += 16;
+ d += 16;
+ width -= 16;
+ } while (width != 0);
} else {
- memcpy(dst, &src_y[3 * src_stride], w);
+ // Copy directly for non-subpel locations.
+ s += (SUBPEL_TAPS / 2 - 1) * src_stride;
+
+ do {
+ uint8x16_t s0 = vld1q_u8(s);
+ vst1q_u8(d, s0);
+ s += 16;
+ d += 16;
+ width -= 16;
+ } while (width != 0);
}
- dst += dst_stride;
+
y_q4 += y_step_q4;
- } while (--y);
+ dst += dst_stride;
+ } while (--h != 0);
}
void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const InterpKernel *filter,
int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
int w, int h) {
- // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // The fixed-size intermediate buffer, im_block, places limits on parameters.
// 2d filtering proceeds in 2 steps:
- // (1) Interpolate horizontally into an intermediate buffer, temp.
- // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // (1) Interpolate horizontally into the intermediate buffer, im_block.
+ // (2) Interpolate im_block vertically to derive the sub-pixel result.
- // Deriving the maximum number of rows in the temp buffer (135):
+ // Deriving the maximum number of rows in the im_block buffer (135):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 64x64 pixels.
// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
@@ -288,33 +296,20 @@ void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- // When calling in frame scaling function, the smallest scaling factor is x1/4
- // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
- // big enough.
+ // When called from the frame scaling function, the smallest scaling factor
+ // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16, the im_block
+ // buffer is still big enough.
- DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
- const int intermediate_height =
+ DECLARE_ALIGNED(16, uint8_t, im_block[(135 + 8) * 64]);
+ const int im_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+ const ptrdiff_t im_stride = 64;
assert(w <= 64);
assert(h <= 64);
assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
assert(x_step_q4 <= 64);
- if (w >= 8) {
- scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
- intermediate_height);
- } else {
- scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
- intermediate_height);
- }
+ scaledconvolve_horiz_neon(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, im_block, im_stride, filter, x0_q4,
+ x_step_q4, w, im_height);
- if (w >= 16) {
- scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, filter, y0_q4, y_step_q4, w, h);
- } else if (w == 8) {
- scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, filter, y0_q4, y_step_q4, w, h);
- } else {
- scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, filter, y0_q4, y_step_q4, w, h);
- }
+ scaledconvolve_vert_neon(im_block, im_stride, dst, dst_stride, filter, y0_q4,
+ y_step_q4, w, h);
}
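
Expanding the 135-row bound from the comment above (our arithmetic): the last of 64 output rows starts at source row ((64 - 1) * 32) >> SUBPEL_BITS = 126, and its filter support spans SUBPEL_TAPS = 8 further rows, so 126 + 8 + 1 = 135 rows suffice; the declared (135 + 8) * 64 im_block simply adds eight rows of headroom.
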
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
index 2bee91f449..916dc62cef 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
@@ -112,7 +112,8 @@ DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c
-DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_neon.c
+DSP_SRCS-$(HAVE_SVE) += arm/highbd_vpx_convolve8_sve.c
+DSP_SRCS-$(HAVE_SVE2) += arm/highbd_vpx_convolve8_sve2.c
endif
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm
@@ -139,9 +140,7 @@ DSP_SRCS-yes += arm/vpx_convolve8_neon.c
DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c
DSP_SRCS-yes += arm/vpx_convolve_neon.c
DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve8_neon_dotprod.c
-DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve_neon_dotprod.c
DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve8_neon_i8mm.c
-DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve_neon_i8mm.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM
@@ -374,6 +373,7 @@ DSP_SRCS-yes += sad.c
DSP_SRCS-yes += subtract.c
DSP_SRCS-yes += sum_squares.c
DSP_SRCS-$(HAVE_NEON) += arm/sum_squares_neon.c
+DSP_SRCS-$(HAVE_SVE) += arm/sum_squares_sve.c
DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c
DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c
@@ -454,6 +454,8 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_sse_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/highbd_variance_neon_dotprod.c
+DSP_SRCS-$(HAVE_SVE) += arm/highbd_variance_sve.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
index 030c456d39..2b8c656afb 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
@@ -12,4 +12,4 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/vpx_once.h"
-void vpx_dsp_rtcd() { once(setup_rtcd_internal); }
+void vpx_dsp_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 18087e25d9..f40f85c036 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -427,19 +427,19 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
- specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_horiz avx2 neon sve/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
- specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_vert avx2 neon sve2/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
- specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon sve/, "$sse2_x86_64";
add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
- specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64";
+ specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon sve2/, "$sse2_x86_64";
} # CONFIG_VP9_HIGHBITDEPTH
if (vpx_config("CONFIG_VP9") eq "yes") {
@@ -1009,7 +1009,7 @@ add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride,
specialize qw/vpx_sad_skip_4x4x4d neon/;
add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
-specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/;
+specialize qw/vpx_sum_squares_2d_i16 neon sve sse2 msa/;
#
# Structured Similarity (SSIM)
@@ -1411,163 +1411,163 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, i
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance64x64 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance64x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance64x32 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance64x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x64 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance32x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x32 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance32x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x16 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance32x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x32 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance16x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x16 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance16x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x8 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance16x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance8x16 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance8x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance8x8 sse2 neon/;
+ specialize qw/vpx_highbd_12_variance8x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance8x4 neon/;
+ specialize qw/vpx_highbd_12_variance8x4 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance4x8 neon/;
+ specialize qw/vpx_highbd_12_variance4x8 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance4x4 neon/;
+ specialize qw/vpx_highbd_12_variance4x4 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance64x64 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance64x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance64x32 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance64x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x64 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance32x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x32 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance32x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x16 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance32x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x32 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance16x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x16 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance16x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x8 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance16x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance8x16 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance8x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance8x8 sse2 neon/;
+ specialize qw/vpx_highbd_10_variance8x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance8x4 neon/;
+ specialize qw/vpx_highbd_10_variance8x4 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance4x8 neon/;
+ specialize qw/vpx_highbd_10_variance4x8 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance4x4 neon/;
+ specialize qw/vpx_highbd_10_variance4x4 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance64x64 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance64x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance64x32 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance64x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x64 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance32x64 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x32 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance32x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x16 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance32x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x32 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance16x32 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x16 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance16x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x8 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance16x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance8x16 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance8x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance8x8 sse2 neon/;
+ specialize qw/vpx_highbd_8_variance8x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance8x4 neon/;
+ specialize qw/vpx_highbd_8_variance8x4 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance4x8 neon/;
+ specialize qw/vpx_highbd_8_variance4x8 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance4x4 neon/;
+ specialize qw/vpx_highbd_8_variance4x4 neon sve/;
add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_8_get16x16var sse2 neon/;
+ specialize qw/vpx_highbd_8_get16x16var sse2 neon sve/;
add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_8_get8x8var sse2 neon/;
+ specialize qw/vpx_highbd_8_get8x8var sse2 neon sve/;
add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_10_get16x16var sse2 neon/;
+ specialize qw/vpx_highbd_10_get16x16var sse2 neon sve/;
add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_10_get8x8var sse2 neon/;
+ specialize qw/vpx_highbd_10_get8x8var sse2 neon sve/;
add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_12_get16x16var sse2 neon/;
+ specialize qw/vpx_highbd_12_get16x16var sse2 neon sve/;
add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_12_get8x8var sse2 neon/;
+ specialize qw/vpx_highbd_12_get8x8var sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse16x16 sse2 neon/;
+ specialize qw/vpx_highbd_8_mse16x16 sse2 neon neon_dotprod/;
add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse16x8 neon/;
+ specialize qw/vpx_highbd_8_mse16x8 neon neon_dotprod/;
add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse8x16 neon/;
+ specialize qw/vpx_highbd_8_mse8x16 neon neon_dotprod/;
add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse8x8 sse2 neon/;
+ specialize qw/vpx_highbd_8_mse8x8 sse2 neon neon_dotprod/;
add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse16x16 sse2 neon/;
+ specialize qw/vpx_highbd_10_mse16x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse16x8 neon/;
+ specialize qw/vpx_highbd_10_mse16x8 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse8x16 neon/;
+ specialize qw/vpx_highbd_10_mse8x16 neon sve/;
add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse8x8 sse2 neon/;
+ specialize qw/vpx_highbd_10_mse8x8 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse16x16 sse2 neon/;
+ specialize qw/vpx_highbd_12_mse16x16 sse2 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse16x8 neon/;
+ specialize qw/vpx_highbd_12_mse16x8 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse8x16 neon/;
+ specialize qw/vpx_highbd_12_mse8x16 neon sve/;
add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse8x8 sse2 neon/;
+ specialize qw/vpx_highbd_12_mse8x8 sse2 neon sve/;
add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride";
specialize qw/vpx_highbd_comp_avg_pred neon sse2/;
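A note on the specialize lines above: rtcd.pl expands each add_proto/specialize pair into per-ISA declarations plus a dispatch pointer that runtime CPU detection retargets, so adding "sve" (or "neon_dotprod") is all that is needed to route callers to the new kernels. A rough C sketch of the generated shape for one prototype (illustrative only; the real header is emitted at build time, and arm_cpu_caps()/HAS_SVE come from vpx_ports/arm.h):

    #include <stdint.h>

    /* Per-ISA implementations named from the prototype string. */
    unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *ref_ptr,
                                               int ref_stride,
                                               unsigned int *sse);
    unsigned int vpx_highbd_10_variance16x16_sve(const uint8_t *src_ptr,
                                                 int src_stride,
                                                 const uint8_t *ref_ptr,
                                                 int ref_stride,
                                                 unsigned int *sse);

    /* Dispatch pointer that callers actually use. */
    unsigned int (*vpx_highbd_10_variance16x16)(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride,
                                                unsigned int *sse);

    static void setup_rtcd_internal(void) {
      const int flags = arm_cpu_caps();
      vpx_highbd_10_variance16x16 = vpx_highbd_10_variance16x16_c;
      if (flags & HAS_SVE) {
        vpx_highbd_10_variance16x16 = vpx_highbd_10_variance16x16_sve;
      }
    }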
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_filter.h b/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
index 0cddcb6991..eb8ff06cd7 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
@@ -28,7 +28,6 @@ extern "C" {
typedef int16_t InterpKernel[SUBPEL_TAPS];
static INLINE int vpx_get_filter_taps(const int16_t *const filter) {
- assert(filter[3] != 128);
if (filter[0] | filter[7]) {
return 8;
}
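With the assert gone, vpx_get_filter_taps() no longer rejects kernels whose center tap is 128; it simply classifies a symmetric 8-entry kernel by its outermost nonzero coefficient pair. A self-contained sketch of that classification (the exact tiers in vpx_filter.h may differ):

    #include <stdint.h>

    static int get_filter_taps_sketch(const int16_t *filter) {
      if (filter[0] | filter[7]) return 8;  /* full 8-tap kernel */
      if (filter[1] | filter[6]) return 6;
      if (filter[2] | filter[5]) return 4;
      return 2;                             /* bilinear */
    }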
diff --git a/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c
index 539d09bb39..eba12d312a 100644
--- a/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c
+++ b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c
@@ -15,7 +15,7 @@
#include <sys/sysctl.h>
#endif
-#if !CONFIG_RUNTIME_CPU_DETECT
+#if !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__)
static int arm_get_cpu_caps(void) {
// This function should actually be a no-op. There is no way to adjust any of
@@ -28,7 +28,7 @@ static int arm_get_cpu_caps(void) {
return flags;
}
-#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT
+#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT || defined(__OpenBSD__)
// sysctlbyname() parameter documentation for instruction set characteristics:
// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics
@@ -99,14 +99,17 @@ static int arm_get_cpu_caps(void) {
// hwcap values are not defined should not prevent features from being enabled.
#define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20)
#define VPX_AARCH64_HWCAP_SVE (1 << 22)
+#define VPX_AARCH64_HWCAP2_SVE2 (1 << 1)
#define VPX_AARCH64_HWCAP2_I8MM (1 << 13)
static int arm_get_cpu_caps(void) {
int flags = 0;
+#if HAVE_NEON_DOTPROD || HAVE_SVE
unsigned long hwcap = getauxval(AT_HWCAP);
-#if HAVE_NEON_I8MM
+#endif // HAVE_NEON_DOTPROD || HAVE_SVE
+#if HAVE_NEON_I8MM || HAVE_SVE2
unsigned long hwcap2 = getauxval(AT_HWCAP2);
-#endif // HAVE_NEON_I8MM
+#endif // HAVE_NEON_I8MM || HAVE_SVE2
#if HAVE_NEON
flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A.
#endif // HAVE_NEON
@@ -125,6 +128,11 @@ static int arm_get_cpu_caps(void) {
flags |= HAS_SVE;
}
#endif // HAVE_SVE
+#if HAVE_SVE2
+ if (hwcap2 & VPX_AARCH64_HWCAP2_SVE2) {
+ flags |= HAS_SVE2;
+ }
+#endif // HAVE_SVE2
return flags;
}
@@ -195,5 +203,10 @@ int arm_cpu_caps(void) {
flags &= ~HAS_SVE;
}
+ // Restrict flags: FEAT_SVE2 assumes that FEAT_SVE is available.
+ if (!(flags & HAS_SVE)) {
+ flags &= ~HAS_SVE2;
+ }
+
return flags;
}
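For reference, the Linux/Android detection above boils down to two getauxval() reads plus the SVE2-implies-SVE clamp. A minimal standalone sketch (aarch64 Linux only; the HWCAP constants mirror the kernel's bits as defined above):

    #include <stdio.h>
    #include <sys/auxv.h>

    #define VPX_AARCH64_HWCAP_SVE (1 << 22)
    #define VPX_AARCH64_HWCAP2_SVE2 (1 << 1)

    int main(void) {
      const unsigned long hwcap = getauxval(AT_HWCAP);
      const unsigned long hwcap2 = getauxval(AT_HWCAP2);
      const int has_sve = (hwcap & VPX_AARCH64_HWCAP_SVE) != 0;
      /* FEAT_SVE2 assumes FEAT_SVE, matching the flag clamp above. */
      const int has_sve2 = has_sve && (hwcap2 & VPX_AARCH64_HWCAP2_SVE2) != 0;
      printf("SVE: %d, SVE2: %d\n", has_sve, has_sve2);
      return 0;
    }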
diff --git a/media/libvpx/libvpx/vpx_ports/arm.h b/media/libvpx/libvpx/vpx_ports/arm.h
index 39365d18ee..814c3cc408 100644
--- a/media/libvpx/libvpx/vpx_ports/arm.h
+++ b/media/libvpx/libvpx/vpx_ports/arm.h
@@ -25,6 +25,8 @@ extern "C" {
#define HAS_NEON_I8MM (1 << 2)
// Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A.
#define HAS_SVE (1 << 3)
+// Armv9.0-A SVE2 instructions.
+#define HAS_SVE2 (1 << 4)
int arm_cpu_caps(void);
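The new HAS_SVE2 bit composes with the existing flags; a hedged sketch of a caller gating kernel selection on it (the kernel names here are hypothetical):

    #include "vpx_ports/arm.h"

    typedef void (*kernel_fn)(void);
    void kernel_neon(void);  /* hypothetical per-ISA kernels */
    void kernel_sve(void);
    void kernel_sve2(void);

    static kernel_fn pick_kernel(void) {
      const int caps = arm_cpu_caps();
      if (caps & HAS_SVE2) return kernel_sve2;  /* HAS_SVE2 implies HAS_SVE */
      if (caps & HAS_SVE) return kernel_sve;
      return kernel_neon;  /* Neon is mandatory in Armv8.0-A */
    }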
diff --git a/media/libvpx/libvpx/vpx_ports/emms_mmx.c b/media/libvpx/libvpx/vpx_ports/emms_mmx.c
index f1036b98ed..79b98a75f1 100644
--- a/media/libvpx/libvpx/vpx_ports/emms_mmx.c
+++ b/media/libvpx/libvpx/vpx_ports/emms_mmx.c
@@ -12,4 +12,4 @@
#include "vpx_ports/system_state.h"
-void vpx_clear_system_state() { _mm_empty(); }
+void vpx_clear_system_state(void) { _mm_empty(); }
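This () to (void) change (repeated in vpx_scale_rtcd.c below) is not cosmetic: in C before C23, an empty parameter list declares a function with unspecified parameters, while (void) declares one that takes none, letting the compiler diagnose stray arguments. A two-line illustration:

    void takes_nothing(void); /* zero parameters: takes_nothing(1) is an error */
    void unspecified();       /* unspecified parameters: unspecified(1) may
                                 compile without a diagnostic (pre-C23) */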
diff --git a/media/libvpx/libvpx/vpx_ports/mem.h b/media/libvpx/libvpx/vpx_ports/mem.h
index 5eccfe8f50..ee9e095633 100644
--- a/media/libvpx/libvpx/vpx_ports/mem.h
+++ b/media/libvpx/libvpx/vpx_ports/mem.h
@@ -23,7 +23,13 @@
#define DECLARE_ALIGNED(n, typ, val) typ val
#endif
-#if HAVE_NEON && defined(_MSC_VER)
+#if defined(__has_builtin)
+#define VPX_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define VPX_HAS_BUILTIN(x) 0
+#endif
+
+#if !VPX_HAS_BUILTIN(__builtin_prefetch) && !defined(__GNUC__)
#define __builtin_prefetch(x)
#endif
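The reworked guard lets callers issue __builtin_prefetch unconditionally: compilers that report the builtin via __has_builtin (or define __GNUC__) use the real intrinsic, and anything else now gets a no-op macro rather than a build break. A small usage sketch (sum_rows is hypothetical):

    #include <stdint.h>
    #include "vpx_ports/mem.h"

    static int sum_rows(const uint8_t *src, int stride, int rows) {
      int total = 0;
      for (int r = 0; r < rows; ++r) {
        if (r + 1 < rows) __builtin_prefetch(src + (r + 1) * stride);
        for (int c = 0; c < 16; ++c) total += src[r * stride + c];
      }
      return total;
    }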
diff --git a/media/libvpx/libvpx/vpx_ports/vpx_once.h b/media/libvpx/libvpx/vpx_ports/vpx_once.h
index d8a8ed89fe..d33eff4397 100644
--- a/media/libvpx/libvpx/vpx_ports/vpx_once.h
+++ b/media/libvpx/libvpx/vpx_ports/vpx_once.h
@@ -91,29 +91,6 @@ static void once(void (*func)(void)) {
return;
}
-#elif CONFIG_MULTITHREAD && defined(__OS2__)
-#define INCL_DOS
-#include <os2.h>
-static void once(void (*func)(void)) {
- static volatile int done;
-
- /* If the initialization is complete, return early. */
- if (done) return;
-
- /* Causes all other threads in the process to block themselves
- * and give up their time slice.
- */
- DosEnterCritSec();
-
- if (!done) {
- func();
- done = 1;
- }
-
- /* Restores normal thread dispatching for the current process. */
- DosExitCritSec();
-}
-
#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
#include <pthread.h>
static void once(void (*func)(void)) {
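With the OS/2 branch removed, the remaining multithreaded path funnels through pthreads; its shape is essentially pthread_once (a sketch; the body in vpx_once.h may differ in detail):

    #include <pthread.h>

    static void once(void (*func)(void)) {
      static pthread_once_t lock = PTHREAD_ONCE_INIT;
      pthread_once(&lock, func);
    }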
diff --git a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c
index dc4d9593a8..706b0770c8 100644
--- a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c
+++ b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c
@@ -12,4 +12,4 @@
#include "./vpx_scale_rtcd.h"
#include "vpx_ports/vpx_once.h"
-void vpx_scale_rtcd() { once(setup_rtcd_internal); }
+void vpx_scale_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vpx_util/vpx_pthread.h b/media/libvpx/libvpx/vpx_util/vpx_pthread.h
new file mode 100644
index 0000000000..cdd18d0f30
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_pthread.h
@@ -0,0 +1,157 @@
+// Copyright 2024 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// pthread.h wrapper
+
+#ifndef VPX_VPX_UTIL_VPX_PTHREAD_H_
+#define VPX_VPX_UTIL_VPX_PTHREAD_H_
+
+#include "./vpx_config.h"
+
+#if CONFIG_MULTITHREAD
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+// Prevent leaking max/min macros.
+#undef NOMINMAX
+#define NOMINMAX
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#include <errno.h> // NOLINT
+#include <process.h> // NOLINT
+#include <stddef.h> // NOLINT
+#include <windows.h> // NOLINT
+typedef HANDLE pthread_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
+
+#if _WIN32_WINNT < 0x0600
+#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer.
+#endif
+typedef CONDITION_VARIABLE pthread_cond_t;
+
+#ifndef WINAPI_FAMILY_PARTITION
+#define WINAPI_PARTITION_DESKTOP 1
+#define WINAPI_FAMILY_PARTITION(x) x
+#endif
+
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define USE_CREATE_THREAD
+#endif
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+// _beginthreadex requires __stdcall
+#if defined(__GNUC__) && \
+ (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
+#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall
+#else
+#define THREADFN unsigned int __stdcall
+#endif
+#define THREAD_EXIT_SUCCESS 0
+
+static INLINE int pthread_create(pthread_t *const thread, const void *attr,
+ unsigned int(__stdcall *start)(void *),
+ void *arg) {
+ (void)attr;
+#ifdef USE_CREATE_THREAD
+ *thread = CreateThread(NULL, /* lpThreadAttributes */
+ 0, /* dwStackSize */
+ start, arg, 0, /* dwCreationFlags */
+ NULL); /* lpThreadId */
+#else
+ *thread = (pthread_t)_beginthreadex(NULL, /* void *security */
+ 0, /* unsigned stack_size */
+ start, arg, 0, /* unsigned initflag */
+ NULL); /* unsigned *thrdaddr */
+#endif
+ if (*thread == NULL) return 1;
+ SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
+ return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
+ (void)value_ptr;
+ return (WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/) !=
+ WAIT_OBJECT_0 ||
+ CloseHandle(thread) == 0);
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+ void *mutexattr) {
+ (void)mutexattr;
+ InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
+ return 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+ return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+ EnterCriticalSection(mutex);
+ return 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+ LeaveCriticalSection(mutex);
+ return 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+ DeleteCriticalSection(mutex);
+ return 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+ (void)condition;
+ return 0;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+ void *cond_attr) {
+ (void)cond_attr;
+ InitializeConditionVariable(condition);
+ return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+ WakeConditionVariable(condition);
+ return 0;
+}
+
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+ WakeAllConditionVariable(condition);
+ return 0;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+ pthread_mutex_t *const mutex) {
+ int ok;
+ ok = SleepConditionVariableCS(condition, mutex, INFINITE);
+ return !ok;
+}
+#else // _WIN32
+#include <pthread.h> // NOLINT
+#define THREADFN void *
+#define THREAD_EXIT_SUCCESS NULL
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // CONFIG_MULTITHREAD
+
+#endif // VPX_VPX_UTIL_VPX_PTHREAD_H_
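The point of the new wrapper is that callers write plain pthread code once and it compiles against either native pthreads or the Win32 emulation above, with THREADFN/THREAD_EXIT_SUCCESS papering over the differing thread-function signatures. A hedged usage sketch (start_two_workers is hypothetical and assumes CONFIG_MULTITHREAD):

    #include "vpx_util/vpx_pthread.h"

    static pthread_mutex_t lock;
    static int counter = 0;

    static THREADFN worker(void *arg) {
      (void)arg;
      pthread_mutex_lock(&lock);
      ++counter;
      pthread_mutex_unlock(&lock);
      return THREAD_EXIT_SUCCESS;
    }

    static int start_two_workers(void) {
      pthread_t t1, t2;
      pthread_mutex_init(&lock, NULL);
      if (pthread_create(&t1, NULL, worker, NULL)) return 1;
      if (pthread_create(&t2, NULL, worker, NULL)) return 1;
      pthread_join(t1, NULL);
      pthread_join(t2, NULL);
      pthread_mutex_destroy(&lock);
      return counter != 2;  /* 0 on success */
    }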
diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.c b/media/libvpx/libvpx/vpx_util/vpx_thread.c
index 04c5fb6f26..0d0e2f5766 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_thread.c
+++ b/media/libvpx/libvpx/vpx_util/vpx_thread.c
@@ -12,10 +12,18 @@
// Original source:
// https://chromium.googlesource.com/webm/libwebp
+// Enable GNU extensions in glibc so that we can call pthread_setname_np().
+// This must be before any #include statements.
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
#include <assert.h>
#include <string.h> // for memset()
+#include "./vpx_config.h"
#include "./vpx_thread.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_pthread.h"
#if CONFIG_MULTITHREAD
@@ -31,23 +39,54 @@ static void execute(VPxWorker *const worker); // Forward declaration.
static THREADFN thread_loop(void *ptr) {
VPxWorker *const worker = (VPxWorker *)ptr;
- int done = 0;
- while (!done) {
- pthread_mutex_lock(&worker->impl_->mutex_);
- while (worker->status_ == OK) { // wait in idling mode
+#ifdef __APPLE__
+ if (worker->thread_name != NULL) {
+ // Apple's version of pthread_setname_np takes one argument and operates on
+ // the current thread only. The maximum size of the thread_name buffer was
+ // noted in the Chromium source code and was confirmed by experiments. If
+ // thread_name is too long, pthread_setname_np returns -1 with errno
+ // ENAMETOOLONG (63).
+ char thread_name[64];
+ strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
+ thread_name[sizeof(thread_name) - 1] = '\0';
+ pthread_setname_np(thread_name);
+ }
+#elif (defined(__GLIBC__) && !defined(__GNU__)) || defined(__BIONIC__)
+ if (worker->thread_name != NULL) {
+ // Linux and Android require that names (including the terminating nul)
+ // fit in 16 chars, otherwise pthread_setname_np() returns ERANGE (34).
+ char thread_name[16];
+ strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
+ thread_name[sizeof(thread_name) - 1] = '\0';
+ pthread_setname_np(pthread_self(), thread_name);
+ }
+#endif
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ for (;;) {
+ while (worker->status_ == VPX_WORKER_STATUS_OK) { // wait in idling mode
pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
}
- if (worker->status_ == WORK) {
+ if (worker->status_ == VPX_WORKER_STATUS_WORKING) {
+ // When worker->status_ is VPX_WORKER_STATUS_WORKING, the main thread
+ // doesn't change worker->status_ and will wait until the worker changes
+ // worker->status_ to VPX_WORKER_STATUS_OK. See change_state(). So the
+ // worker can safely call execute() without holding worker->impl_->mutex_.
+ // When the worker reacquires worker->impl_->mutex_, worker->status_ must
+ // still be VPX_WORKER_STATUS_WORKING.
+ pthread_mutex_unlock(&worker->impl_->mutex_);
execute(worker);
- worker->status_ = OK;
- } else if (worker->status_ == NOT_OK) { // finish the worker
- done = 1;
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ assert(worker->status_ == VPX_WORKER_STATUS_WORKING);
+ worker->status_ = VPX_WORKER_STATUS_OK;
+ // signal to the main thread that we're done (for sync())
+ pthread_cond_signal(&worker->impl_->condition_);
+ } else {
+ assert(worker->status_ == VPX_WORKER_STATUS_NOT_OK); // finish the worker
+ break;
}
- // signal to the main thread that we're done (for sync())
- pthread_cond_signal(&worker->impl_->condition_);
- pthread_mutex_unlock(&worker->impl_->mutex_);
}
- return THREAD_RETURN(NULL); // Thread is finished
+ pthread_mutex_unlock(&worker->impl_->mutex_);
+ return THREAD_EXIT_SUCCESS; // Thread is finished
}
// main thread state control
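The naming block above only reads worker->thread_name at thread start, so callers must assign it before the worker thread is created, i.e. before reset(). A hedged sketch through the public worker interface (run_named_worker and the name string are hypothetical):

    #include "vpx_util/vpx_thread.h"

    static int run_named_worker(VPxWorkerHook hook, void *in, void *out) {
      const VPxWorkerInterface *iface = vpx_get_worker_interface();
      VPxWorker worker;
      iface->init(&worker);
      worker.thread_name = "vpx_tile_work";  /* <= 15 chars, outlives thread */
      worker.hook = hook;
      worker.data1 = in;
      worker.data2 = out;
      if (!iface->reset(&worker)) return 0;  /* spawns the named thread */
      iface->launch(&worker);
      const int ok = iface->sync(&worker);   /* blocks until hook finishes */
      iface->end(&worker);
      return ok;
    }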
@@ -58,13 +97,13 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) {
if (worker->impl_ == NULL) return;
pthread_mutex_lock(&worker->impl_->mutex_);
- if (worker->status_ >= OK) {
+ if (worker->status_ >= VPX_WORKER_STATUS_OK) {
// wait for the worker to finish
- while (worker->status_ != OK) {
+ while (worker->status_ != VPX_WORKER_STATUS_OK) {
pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
}
// assign new status and release the working thread if needed
- if (new_status != OK) {
+ if (new_status != VPX_WORKER_STATUS_OK) {
worker->status_ = new_status;
pthread_cond_signal(&worker->impl_->condition_);
}
@@ -78,21 +117,21 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) {
static void init(VPxWorker *const worker) {
memset(worker, 0, sizeof(*worker));
- worker->status_ = NOT_OK;
+ worker->status_ = VPX_WORKER_STATUS_NOT_OK;
}
static int sync(VPxWorker *const worker) {
#if CONFIG_MULTITHREAD
- change_state(worker, OK);
+ change_state(worker, VPX_WORKER_STATUS_OK);
#endif
- assert(worker->status_ <= OK);
+ assert(worker->status_ <= VPX_WORKER_STATUS_OK);
return !worker->had_error;
}
static int reset(VPxWorker *const worker) {
int ok = 1;
worker->had_error = 0;
- if (worker->status_ < OK) {
+ if (worker->status_ < VPX_WORKER_STATUS_OK) {
#if CONFIG_MULTITHREAD
worker->impl_ = (VPxWorkerImpl *)vpx_calloc(1, sizeof(*worker->impl_));
if (worker->impl_ == NULL) {
@@ -107,7 +146,7 @@ static int reset(VPxWorker *const worker) {
}
pthread_mutex_lock(&worker->impl_->mutex_);
ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker);
- if (ok) worker->status_ = OK;
+ if (ok) worker->status_ = VPX_WORKER_STATUS_OK;
pthread_mutex_unlock(&worker->impl_->mutex_);
if (!ok) {
pthread_mutex_destroy(&worker->impl_->mutex_);
@@ -118,12 +157,12 @@ static int reset(VPxWorker *const worker) {
return 0;
}
#else
- worker->status_ = OK;
+ worker->status_ = VPX_WORKER_STATUS_OK;
#endif
- } else if (worker->status_ > OK) {
+ } else if (worker->status_ > VPX_WORKER_STATUS_OK) {
ok = sync(worker);
}
- assert(!ok || (worker->status_ == OK));
+ assert(!ok || (worker->status_ == VPX_WORKER_STATUS_OK));
return ok;
}
@@ -135,7 +174,7 @@ static void execute(VPxWorker *const worker) {
static void launch(VPxWorker *const worker) {
#if CONFIG_MULTITHREAD
- change_state(worker, WORK);
+ change_state(worker, VPX_WORKER_STATUS_WORKING);
#else
execute(worker);
#endif
@@ -144,7 +183,7 @@ static void launch(VPxWorker *const worker) {
static void end(VPxWorker *const worker) {
#if CONFIG_MULTITHREAD
if (worker->impl_ != NULL) {
- change_state(worker, NOT_OK);
+ change_state(worker, VPX_WORKER_STATUS_NOT_OK);
pthread_join(worker->impl_->thread_, NULL);
pthread_mutex_destroy(&worker->impl_->mutex_);
pthread_cond_destroy(&worker->impl_->condition_);
@@ -152,10 +191,10 @@ static void end(VPxWorker *const worker) {
worker->impl_ = NULL;
}
#else
- worker->status_ = NOT_OK;
+ worker->status_ = VPX_WORKER_STATUS_NOT_OK;
assert(worker->impl_ == NULL);
#endif
- assert(worker->status_ == NOT_OK);
+ assert(worker->status_ == VPX_WORKER_STATUS_NOT_OK);
}
//------------------------------------------------------------------------------
diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.h b/media/libvpx/libvpx/vpx_util/vpx_thread.h
index 6d308e949b..11a1d74387 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_thread.h
+++ b/media/libvpx/libvpx/vpx_util/vpx_thread.h
@@ -15,370 +15,22 @@
#ifndef VPX_VPX_UTIL_VPX_THREAD_H_
#define VPX_VPX_UTIL_VPX_THREAD_H_
-#include "./vpx_config.h"
-
#ifdef __cplusplus
extern "C" {
#endif
-// Set maximum decode threads to be 8 due to the limit of frame buffers
-// and not enough semaphores in the emulation layer on windows.
-#define MAX_DECODE_THREADS 8
-
-#if CONFIG_MULTITHREAD
-
-#if defined(_WIN32) && !HAVE_PTHREAD_H
-#include <errno.h> // NOLINT
-#include <process.h> // NOLINT
-#include <windows.h> // NOLINT
-typedef HANDLE pthread_t;
-typedef CRITICAL_SECTION pthread_mutex_t;
-
-#if _WIN32_WINNT >= 0x0600 // Windows Vista / Server 2008 or greater
-#define USE_WINDOWS_CONDITION_VARIABLE
-typedef CONDITION_VARIABLE pthread_cond_t;
-#else
-typedef struct {
- HANDLE waiting_sem_;
- HANDLE received_sem_;
- HANDLE signal_event_;
-} pthread_cond_t;
-#endif // _WIN32_WINNT >= 0x600
-
-#ifndef WINAPI_FAMILY_PARTITION
-#define WINAPI_PARTITION_DESKTOP 1
-#define WINAPI_FAMILY_PARTITION(x) x
-#endif
-
-#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-#define USE_CREATE_THREAD
-#endif
-
-//------------------------------------------------------------------------------
-// simplistic pthread emulation layer
-
-// _beginthreadex requires __stdcall
-#if defined(__GNUC__) && \
- (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
-#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall
-#else
-#define THREADFN unsigned int __stdcall
-#endif
-#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
-
-#if _WIN32_WINNT >= 0x0501 // Windows XP or greater
-#define WaitForSingleObject(obj, timeout) \
- WaitForSingleObjectEx(obj, timeout, FALSE /*bAlertable*/)
-#endif
-
-static INLINE int pthread_create(pthread_t *const thread, const void *attr,
- unsigned int(__stdcall *start)(void *),
- void *arg) {
- (void)attr;
-#ifdef USE_CREATE_THREAD
- *thread = CreateThread(NULL, /* lpThreadAttributes */
- 0, /* dwStackSize */
- start, arg, 0, /* dwStackSize */
- NULL); /* lpThreadId */
-#else
- *thread = (pthread_t)_beginthreadex(NULL, /* void *security */
- 0, /* unsigned stack_size */
- start, arg, 0, /* unsigned initflag */
- NULL); /* unsigned *thrdaddr */
-#endif
- if (*thread == NULL) return 1;
- SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
- return 0;
-}
-
-static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
- (void)value_ptr;
- return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
- CloseHandle(thread) == 0);
-}
-
-// Mutex
-static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
- void *mutexattr) {
- (void)mutexattr;
-#if _WIN32_WINNT >= 0x0600 // Windows Vista / Server 2008 or greater
- InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
-#else
- InitializeCriticalSection(mutex);
-#endif
- return 0;
-}
-
-static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
- return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
-}
-
-static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
- EnterCriticalSection(mutex);
- return 0;
-}
-
-static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
- LeaveCriticalSection(mutex);
- return 0;
-}
-
-static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
- DeleteCriticalSection(mutex);
- return 0;
-}
-
-// Condition
-static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
- int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
- (void)condition;
-#else
- ok &= (CloseHandle(condition->waiting_sem_) != 0);
- ok &= (CloseHandle(condition->received_sem_) != 0);
- ok &= (CloseHandle(condition->signal_event_) != 0);
-#endif
- return !ok;
-}
-
-static INLINE int pthread_cond_init(pthread_cond_t *const condition,
- void *cond_attr) {
- (void)cond_attr;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
- InitializeConditionVariable(condition);
-#else
- condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
- condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
- condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
- if (condition->waiting_sem_ == NULL || condition->received_sem_ == NULL ||
- condition->signal_event_ == NULL) {
- pthread_cond_destroy(condition);
- return 1;
- }
-#endif
- return 0;
-}
-
-static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
- int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
- WakeAllConditionVariable(condition);
-#else
- while (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
- // a thread is waiting in pthread_cond_wait: allow it to be notified
- ok &= SetEvent(condition->signal_event_);
- // wait until the event is consumed so the signaler cannot consume
- // the event via its own pthread_cond_wait.
- ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
- WAIT_OBJECT_0);
- }
-#endif
- return !ok;
-}
-
-static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
- int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
- WakeConditionVariable(condition);
-#else
- if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
- // a thread is waiting in pthread_cond_wait: allow it to be notified
- ok = SetEvent(condition->signal_event_);
- // wait until the event is consumed so the signaler cannot consume
- // the event via its own pthread_cond_wait.
- ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
- WAIT_OBJECT_0);
- }
-#endif
- return !ok;
-}
-
-static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
- pthread_mutex_t *const mutex) {
- int ok;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
- ok = SleepConditionVariableCS(condition, mutex, INFINITE);
-#else
- // note that there is a consumer available so the signal isn't dropped in
- // pthread_cond_signal
- if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) return 1;
- // now unlock the mutex so pthread_cond_signal may be issued
- pthread_mutex_unlock(mutex);
- ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
- WAIT_OBJECT_0);
- ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
- pthread_mutex_lock(mutex);
-#endif
- return !ok;
-}
-
-#elif defined(__OS2__)
-#define INCL_DOS
-#include <os2.h> // NOLINT
-
-#include <errno.h> // NOLINT
-#include <stdlib.h> // NOLINT
-#include <sys/builtin.h> // NOLINT
-
-#if defined(__STRICT_ANSI__)
-// _beginthread() is not declared on __STRICT_ANSI__ mode. Declare here.
-int _beginthread(void (*)(void *), void *, unsigned, void *);
-#endif
-
-#define pthread_t TID
-#define pthread_mutex_t HMTX
-
-typedef struct {
- HEV event_sem_;
- HEV ack_sem_;
- volatile unsigned wait_count_;
-} pthread_cond_t;
-
-//------------------------------------------------------------------------------
-// simplistic pthread emulation layer
-
-#define THREADFN void *
-#define THREAD_RETURN(val) (val)
-
-typedef struct {
- void *(*start_)(void *);
- void *arg_;
-} thread_arg;
-
-static void thread_start(void *arg) {
- thread_arg targ = *(thread_arg *)arg;
- free(arg);
-
- targ.start_(targ.arg_);
-}
-
-static INLINE int pthread_create(pthread_t *const thread, const void *attr,
- void *(*start)(void *), void *arg) {
- int tid;
- thread_arg *targ = (thread_arg *)malloc(sizeof(*targ));
- if (targ == NULL) return 1;
-
- (void)attr;
-
- targ->start_ = start;
- targ->arg_ = arg;
- tid = (pthread_t)_beginthread(thread_start, NULL, 1024 * 1024, targ);
- if (tid == -1) {
- free(targ);
- return 1;
- }
-
- *thread = tid;
- return 0;
-}
-
-static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
- (void)value_ptr;
- return DosWaitThread(&thread, DCWW_WAIT) != 0;
-}
-
-// Mutex
-static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
- void *mutexattr) {
- (void)mutexattr;
- return DosCreateMutexSem(NULL, mutex, 0, FALSE) != 0;
-}
-
-static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
- return DosRequestMutexSem(*mutex, SEM_IMMEDIATE_RETURN) == 0 ? 0 : EBUSY;
-}
-
-static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
- return DosRequestMutexSem(*mutex, SEM_INDEFINITE_WAIT) != 0;
-}
-
-static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
- return DosReleaseMutexSem(*mutex) != 0;
-}
-
-static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
- return DosCloseMutexSem(*mutex) != 0;
-}
-
-// Condition
-static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
- int ok = 1;
- ok &= DosCloseEventSem(condition->event_sem_) == 0;
- ok &= DosCloseEventSem(condition->ack_sem_) == 0;
- return !ok;
-}
-
-static INLINE int pthread_cond_init(pthread_cond_t *const condition,
- void *cond_attr) {
- int ok = 1;
- (void)cond_attr;
-
- ok &=
- DosCreateEventSem(NULL, &condition->event_sem_, DCE_POSTONE, FALSE) == 0;
- ok &= DosCreateEventSem(NULL, &condition->ack_sem_, DCE_POSTONE, FALSE) == 0;
- if (!ok) {
- pthread_cond_destroy(condition);
- return 1;
- }
- condition->wait_count_ = 0;
- return 0;
-}
-
-static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
- int ok = 1;
-
- if (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) {
- ok &= DosPostEventSem(condition->event_sem_) == 0;
- ok &= DosWaitEventSem(condition->ack_sem_, SEM_INDEFINITE_WAIT) == 0;
- }
-
- return !ok;
-}
-
-static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
- int ok = 1;
-
- while (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0))
- ok &= pthread_cond_signal(condition) == 0;
-
- return !ok;
-}
-
-static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
- pthread_mutex_t *const mutex) {
- int ok = 1;
-
- __atomic_increment(&condition->wait_count_);
-
- ok &= pthread_mutex_unlock(mutex) == 0;
-
- ok &= DosWaitEventSem(condition->event_sem_, SEM_INDEFINITE_WAIT) == 0;
-
- __atomic_decrement(&condition->wait_count_);
-
- ok &= DosPostEventSem(condition->ack_sem_) == 0;
-
- pthread_mutex_lock(mutex);
-
- return !ok;
-}
-#else // _WIN32
-#include <pthread.h> // NOLINT
-#define THREADFN void *
-#define THREAD_RETURN(val) val
-#endif
-
-#endif // CONFIG_MULTITHREAD
+#define MAX_NUM_THREADS 64
// State of the worker thread object
typedef enum {
- NOT_OK = 0, // object is unusable
- OK, // ready to work
- WORK // busy finishing the current task
+ VPX_WORKER_STATUS_NOT_OK = 0, // object is unusable
+ VPX_WORKER_STATUS_OK, // ready to work
+ VPX_WORKER_STATUS_WORKING // busy finishing the current task
} VPxWorkerStatus;
// Function to be called by the worker thread. Takes two opaque pointers as
-// arguments (data1 and data2), and should return false in case of error.
+// arguments (data1 and data2). Should return true on success and return false
+// in case of error.
typedef int (*VPxWorkerHook)(void *, void *);
// Platform-dependent implementation details for the worker.
@@ -388,10 +40,14 @@ typedef struct VPxWorkerImpl VPxWorkerImpl;
typedef struct {
VPxWorkerImpl *impl_;
VPxWorkerStatus status_;
+ // Thread name for the debugger. If not NULL, must point to a string that
+ // outlives the worker thread. For portability, use a name <= 15 characters
+ // long (not including the terminating NUL character).
+ const char *thread_name;
VPxWorkerHook hook; // hook to call
void *data1; // first argument passed to 'hook'
void *data2; // second argument passed to 'hook'
- int had_error; // return value of the last call to 'hook'
+ int had_error; // true if a call to 'hook' returned false
} VPxWorker;
// The interface for all thread-worker related functions. All these functions
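Per the clarified contract, a hook reports failure by returning false, which sync() then surfaces via had_error. A minimal hypothetical hook honoring that contract:

    /* data1/data2 are opaque; this hypothetical hook sums 16 bytes from
     * data1 into the int pointed to by data2. */
    static int sum_hook(void *data1, void *data2) {
      const unsigned char *buf = (const unsigned char *)data1;
      int *out = (int *)data2;
      if (buf == NULL || out == NULL) return 0;  /* failure -> had_error */
      int total = 0;
      for (int i = 0; i < 16; ++i) total += buf[i];
      *out = total;
      return 1;  /* success */
    }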
diff --git a/media/libvpx/libvpx/vpx_util/vpx_util.mk b/media/libvpx/libvpx/vpx_util/vpx_util.mk
index 1162714956..948e6d6f89 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_util.mk
+++ b/media/libvpx/libvpx/vpx_util/vpx_util.mk
@@ -10,6 +10,7 @@
UTIL_SRCS-yes += vpx_atomics.h
UTIL_SRCS-yes += vpx_util.mk
+UTIL_SRCS-yes += vpx_pthread.h
UTIL_SRCS-yes += vpx_thread.c
UTIL_SRCS-yes += vpx_thread.h
UTIL_SRCS-yes += endian_inl.h